Fetch popular: flat-playlist crawl then parallel view_count enrichment

Phase 1: crawl the full channel with flat-playlist to store any videos not yet in the DB (fast, no individual requests).
Phase 2: fetch the real view_count for up to 200 channel videos in parallel (8 workers), prioritising those missing a count.
The Popular tab sorts all channel videos by view_count DESC NULLS LAST.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 23:05:21 +02:00
parent ff4d8e4ab4
commit 6e455ed8ce
1 changed file with 61 additions and 5 deletions
--- a/backend/routers/channels.py
+++ b/backend/routers/channels.py
@@ -657,19 +657,75 @@ def fetch_popular_videos(


 def _fetch_popular_task(channel_id: int, youtube_channel_id: str):
-    """Enrich indexed videos with view_count so Popular tab can rank them.
+    """Half-and-half popular fetch.

-    YouTube's ?sort=p is broken in yt-dlp (their own tests mark it skipped).
-    Instead we fetch real view counts for all indexed videos via individual
-    page requests and sort by view_count DESC locally.
+    Phase 1 (fast): flat-playlist crawl of the full channel → store any
+    new videos in DB (title, duration, thumbnail). No individual requests.
+
+    Phase 2 (parallel): for every video now in DB for this channel,
+    fetch its watch page to get real view_count + published_at.
    Prioritises videos missing view_count; caps at 200 per run.
+    Popular tab then sorts by view_count DESC.
    """
    from ..database import SessionLocal
    from concurrent.futures import ThreadPoolExecutor, as_completed

+    # Phase 1 — flat-playlist: crawl all channel videos quickly
+    if youtube_channel_id.startswith("@"):
+        url = f"https://www.youtube.com/{youtube_channel_id}/videos"
+    else:
+        url = f"https://www.youtube.com/channel/{youtube_channel_id}/videos"
+
+    stdout, _, _ = ytdlp._run([
+        "yt-dlp", url,
+        "--dump-json", "--flat-playlist",
+        "--quiet",
+        *ytdlp._cookie_args(),
+    ], timeout=120)
+
+    flat_entries = []
+    for line in stdout.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            info = json.loads(line)
+            yt_id = info.get("id")
+            if yt_id:
+                flat_entries.append({
+                    "id": yt_id,
+                    "title": info.get("title", ""),
+                    "duration": info.get("duration"),
+                })
+        except json.JSONDecodeError:
+            continue
+
+    # Store any new videos from the flat crawl
+    if flat_entries:
+        db = SessionLocal()
+        try:
+            channel = db.query(Channel).filter_by(id=channel_id).first()
+            if channel:
+                for entry in flat_entries:
+                    if not db.query(Video).filter_by(youtube_video_id=entry["id"]).first():
+                        try:
+                            db.add(Video(
+                                youtube_video_id=entry["id"],
+                                channel_id=channel_id,
+                                title=entry["title"],
+                                thumbnail_url=ytdlp._stable_thumbnail(entry["id"]),
+                                duration_seconds=entry["duration"],
+                                tags="[]",
+                            ))
+                            db.commit()
+                        except Exception:
+                            db.rollback()
+        finally:
+            db.close()
+
+    # Phase 2 — individual fetches for view_count, prioritising missing ones
    db = SessionLocal()
    try:
-        # Videos without view_count first, then those with stale counts
        rows = db.execute(
            text("""
                SELECT youtube_video_id FROM videos