Slow down popular fetch and enrich to protect cookies

- Popular fetch phase 2: now sequential with a 2s delay between requests (was 3 parallel workers)
- Popular fetch cap reduced from 200 to 100 videos per run
- DB writes happen after each video instead of all at the end (no data loss on interrupt)
- _enrich_missing_task: delay between requests increased from 0.5s to 2s

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 23:53:37 +02:00
parent ff601d3585
commit 8029b2517f
1 changed file with 31 additions and 38 deletions
--- a/backend/routers/channels.py
+++ b/backend/routers/channels.py
@@ -210,7 +210,7 @@ def _enrich_missing_task(limit: int = 20):
        ).mappings().all()
        for i, row in enumerate(rows):
            if i > 0:
-                time.sleep(0.5)
+                time.sleep(2.0)
            try:
                meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
                if meta:
@@ -723,13 +723,12 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
    Phase 1 (fast): flat-playlist crawl of the full channel → store any
    new videos in DB (title, duration, thumbnail). No individual requests.

-    Phase 2 (parallel): for every video now in DB for this channel,
-    fetch its watch page to get real view_count + published_at.
-    Prioritises videos missing view_count; caps at 200 per run.
-    Popular tab then sorts by view_count DESC.
+    Phase 2 (sequential, polite): fetch each video's watch page one at a time
+    with a 2-second pause between requests to avoid cookie invalidation.
+    Prioritises videos missing view_count; caps at 100 per run.
    """
+    import time
    from ..database import SessionLocal
-    from concurrent.futures import ThreadPoolExecutor, as_completed

    task_id = f"popular-{channel_id}"
    label = f"Popular fetch — {channel_name}" if channel_name else "Popular fetch"
@@ -797,7 +796,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
        finally:
            db.close()

-    # Phase 2 — individual fetches for view_count, prioritising missing ones
+    # Phase 2 — sequential fetches with a polite delay to avoid cookie invalidation
    db = SessionLocal()
    try:
        rows = db.execute(
@@ -805,7 +804,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
                SELECT youtube_video_id FROM videos
                WHERE channel_id = :cid
                ORDER BY (view_count IS NULL) DESC, RANDOM()
-                LIMIT 200
+                LIMIT 100
            """),
            {"cid": channel_id},
        ).mappings().all()
@@ -824,41 +823,35 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
            _tasks[task_id]["total"] = len(video_ids)
            _tasks[task_id]["done"] = 0

-    results = {}
    try:
-        with ThreadPoolExecutor(max_workers=3) as pool:
-            futures = {pool.submit(ytdlp.fetch_video_metadata, vid): vid for vid in video_ids}
-            for future in as_completed(futures):
-                vid = futures[future]
-                try:
-                    results[vid] = future.result()
-                except Exception:
-                    pass
-                with _tasks_lock:
-                    if task_id in _tasks:
-                        _tasks[task_id]["done"] += 1
+        for i, yt_id in enumerate(video_ids):
+            if i > 0:
+                time.sleep(2.0)
+            try:
+                meta = ytdlp.fetch_video_metadata(yt_id)
+                if meta:
+                    db = SessionLocal()
+                    try:
+                        vid = db.query(Video).filter_by(youtube_video_id=yt_id).first()
+                        if vid:
+                            if meta.get("view_count") is not None:
+                                vid.view_count = meta["view_count"]
+                            if not vid.published_at and meta.get("published_at"):
+                                vid.published_at = meta["published_at"]
+                            db.commit()
+                    except Exception:
+                        db.rollback()
+                    finally:
+                        db.close()
+            except Exception:
+                pass
+            with _tasks_lock:
+                if task_id in _tasks:
+                    _tasks[task_id]["done"] += 1
    finally:
        with _tasks_lock:
            _tasks.pop(task_id, None)

-    db = SessionLocal()
-    try:
-        for yt_id, meta in results.items():
-            if not meta:
-                continue
-            try:
-                vid = db.query(Video).filter_by(youtube_video_id=yt_id).first()
-                if vid:
-                    if meta.get("view_count") is not None:
-                        vid.view_count = meta["view_count"]
-                    if not vid.published_at and meta.get("published_at"):
-                        vid.published_at = meta["published_at"]
-                    db.commit()
-            except Exception:
-                db.rollback()
-    finally:
-        db.close()
-

@router.post("/{channel_id}/search", status_code=status.HTTP_202_ACCEPTED)
 def search_channel_youtube(