From 6e455ed8ce13280f1930185d7a75fcdcb71fb29f Mon Sep 17 00:00:00 2001 From: Mattias Thall Date: Tue, 26 May 2026 23:05:21 +0200 Subject: [PATCH] Fetch popular: flat-playlist crawl then parallel view_count enrichment Phase 1: crawl the full channel with flat-playlist to store any videos not yet in DB (fast, no individual requests). Phase 2: fetch real view_count for up to 200 channel videos in parallel (8 workers), prioritising those missing a count. Popular tab sorts all channel videos by view_count DESC NULLS LAST. Co-Authored-By: Claude Sonnet 4.6 --- backend/routers/channels.py | 66 ++++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/backend/routers/channels.py b/backend/routers/channels.py index 5f688b6..6e8252f 100644 --- a/backend/routers/channels.py +++ b/backend/routers/channels.py @@ -657,19 +657,75 @@ def fetch_popular_videos( def _fetch_popular_task(channel_id: int, youtube_channel_id: str): - """Enrich indexed videos with view_count so Popular tab can rank them. + """Half-and-half popular fetch. - YouTube's ?sort=p is broken in yt-dlp (their own tests mark it skipped). - Instead we fetch real view counts for all indexed videos via individual - page requests and sort by view_count DESC locally. + Phase 1 (fast): flat-playlist crawl of the full channel → store any + new videos in DB (title, duration, thumbnail). No individual requests. + + Phase 2 (parallel): for every video now in DB for this channel, + fetch its watch page to get real view_count + published_at. Prioritises videos missing view_count; caps at 200 per run. + Popular tab then sorts by view_count DESC. """ from ..database import SessionLocal from concurrent.futures import ThreadPoolExecutor, as_completed + # Phase 1 — flat-playlist: crawl all channel videos quickly + if youtube_channel_id.startswith("@"): + url = f"https://www.youtube.com/{youtube_channel_id}/videos" + else: + url = f"https://www.youtube.com/channel/{youtube_channel_id}/videos" + + stdout, _, _ = ytdlp._run([ + "yt-dlp", url, + "--dump-json", "--flat-playlist", + "--quiet", + *ytdlp._cookie_args(), + ], timeout=120) + + flat_entries = [] + for line in stdout.splitlines(): + line = line.strip() + if not line: + continue + try: + info = json.loads(line) + yt_id = info.get("id") + if yt_id: + flat_entries.append({ + "id": yt_id, + "title": info.get("title", ""), + "duration": info.get("duration"), + }) + except json.JSONDecodeError: + continue + + # Store any new videos from the flat crawl + if flat_entries: + db = SessionLocal() + try: + channel = db.query(Channel).filter_by(id=channel_id).first() + if channel: + for entry in flat_entries: + if not db.query(Video).filter_by(youtube_video_id=entry["id"]).first(): + try: + db.add(Video( + youtube_video_id=entry["id"], + channel_id=channel_id, + title=entry["title"], + thumbnail_url=ytdlp._stable_thumbnail(entry["id"]), + duration_seconds=entry["duration"], + tags="[]", + )) + db.commit() + except Exception: + db.rollback() + finally: + db.close() + + # Phase 2 — individual fetches for view_count, prioritising missing ones db = SessionLocal() try: - # Videos without view_count first, then those with stale counts rows = db.execute( text(""" SELECT youtube_video_id FROM videos