From 8029b2517f4cbed13824a1d527ca759c1c91e0c8 Mon Sep 17 00:00:00 2001 From: Mattias Thall Date: Tue, 26 May 2026 23:53:37 +0200 Subject: [PATCH] Slow down popular fetch and enrich to protect cookies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Popular fetch phase 2: sequential with 2s delay between requests (was 3 parallel workers) - Reduced from 200 to 100 videos per popular fetch run - DB writes happen after each video instead of all at end (no data loss on interrupt) - _enrich_missing_task: delay increased 0.5s → 2s between requests Co-Authored-By: Claude Sonnet 4.6 --- backend/routers/channels.py | 69 +++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/backend/routers/channels.py b/backend/routers/channels.py index 158054b..375f78b 100644 --- a/backend/routers/channels.py +++ b/backend/routers/channels.py @@ -210,7 +210,7 @@ def _enrich_missing_task(limit: int = 20): ).mappings().all() for i, row in enumerate(rows): if i > 0: - time.sleep(0.5) + time.sleep(2.0) try: meta = ytdlp.fetch_video_metadata(row["youtube_video_id"]) if meta: @@ -723,13 +723,12 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name: Phase 1 (fast): flat-playlist crawl of the full channel → store any new videos in DB (title, duration, thumbnail). No individual requests. - Phase 2 (parallel): for every video now in DB for this channel, - fetch its watch page to get real view_count + published_at. - Prioritises videos missing view_count; caps at 200 per run. - Popular tab then sorts by view_count DESC. + Phase 2 (sequential, polite): fetch each video's watch page one at a time + with a 2-second pause between requests to avoid cookie invalidation. + Prioritises videos missing view_count; caps at 100 per run. """ + import time from ..database import SessionLocal - from concurrent.futures import ThreadPoolExecutor, as_completed task_id = f"popular-{channel_id}" label = f"Popular fetch — {channel_name}" if channel_name else "Popular fetch" @@ -797,7 +796,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name: finally: db.close() - # Phase 2 — individual fetches for view_count, prioritising missing ones + # Phase 2 — sequential fetches with a polite delay to avoid cookie invalidation db = SessionLocal() try: rows = db.execute( @@ -805,7 +804,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name: SELECT youtube_video_id FROM videos WHERE channel_id = :cid ORDER BY (view_count IS NULL) DESC, RANDOM() - LIMIT 200 + LIMIT 100 """), {"cid": channel_id}, ).mappings().all() @@ -824,41 +823,35 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name: _tasks[task_id]["total"] = len(video_ids) _tasks[task_id]["done"] = 0 - results = {} try: - with ThreadPoolExecutor(max_workers=3) as pool: - futures = {pool.submit(ytdlp.fetch_video_metadata, vid): vid for vid in video_ids} - for future in as_completed(futures): - vid = futures[future] - try: - results[vid] = future.result() - except Exception: - pass - with _tasks_lock: - if task_id in _tasks: - _tasks[task_id]["done"] += 1 + for i, yt_id in enumerate(video_ids): + if i > 0: + time.sleep(2.0) + try: + meta = ytdlp.fetch_video_metadata(yt_id) + if meta: + db = SessionLocal() + try: + vid = db.query(Video).filter_by(youtube_video_id=yt_id).first() + if vid: + if meta.get("view_count") is not None: + vid.view_count = meta["view_count"] + if not vid.published_at and meta.get("published_at"): + vid.published_at = meta["published_at"] + db.commit() + except Exception: + db.rollback() + finally: + db.close() + except Exception: + pass + with _tasks_lock: + if task_id in _tasks: + _tasks[task_id]["done"] += 1 finally: with _tasks_lock: _tasks.pop(task_id, None) - db = SessionLocal() - try: - for yt_id, meta in results.items(): - if not meta: - continue - try: - vid = db.query(Video).filter_by(youtube_video_id=yt_id).first() - if vid: - if meta.get("view_count") is not None: - vid.view_count = meta["view_count"] - if not vid.published_at and meta.get("published_at"): - vid.published_at = meta["published_at"] - db.commit() - except Exception: - db.rollback() - finally: - db.close() - @router.post("/{channel_id}/search", status_code=status.HTTP_202_ACCEPTED) def search_channel_youtube(