Fetch popular: flat-playlist crawl then parallel view_count enrichment

Phase 1: crawl the full channel with flat-playlist and store any videos not yet in the DB (fast, no individual requests).
Phase 2: fetch the real view_count for up to 200 channel videos in parallel (8 workers), prioritising those missing a count.
The Popular tab then sorts all channel videos by view_count DESC NULLS LAST.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-26 23:05:21 +02:00
parent ff4d8e4ab4
commit 6e455ed8ce
1 changed file with 61 additions and 5 deletions
--- a/backend/routers/channels.py
+++ b/backend/routers/channels.py
@@ -657,19 +657,75 @@ def fetch_popular_videos(
 def _fetch_popular_task(channel_id: int, youtube_channel_id: str):
-    """Enrich indexed videos with view_count so Popular tab can rank them.
+    """Half-and-half popular fetch.
-    YouTube's ?sort=p is broken in yt-dlp (their own tests mark it skipped).
+    Phase 1 (fast): flat-playlist crawl of the full channel → store any
-    Instead we fetch real view counts for all indexed videos via individual
+    new videos in DB (title, duration, thumbnail). No individual requests.
-    page requests and sort by view_count DESC locally.
+
    Phase 2 (parallel): for every video now in DB for this channel,
    fetch its watch page to get real view_count + published_at.
    Prioritises videos missing view_count; caps at 200 per run.
    Popular tab then sorts by view_count DESC.
    """
    from ..database import SessionLocal
    from concurrent.futures import ThreadPoolExecutor, as_completed
    # Phase 1 — flat-playlist: crawl all channel videos quickly
    if youtube_channel_id.startswith("@"):
        url = f"https://www.youtube.com/{youtube_channel_id}/videos"
    else:
        url = f"https://www.youtube.com/channel/{youtube_channel_id}/videos"
    stdout, _, _ = ytdlp._run([
        "yt-dlp", url,
        "--dump-json", "--flat-playlist",
        "--quiet",
        *ytdlp._cookie_args(),
    ], timeout=120)
    flat_entries = []
    for line in stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            info = json.loads(line)
            yt_id = info.get("id")
            if yt_id:
                flat_entries.append({
                    "id": yt_id,
                    "title": info.get("title", ""),
                    "duration": info.get("duration"),
                })
        except json.JSONDecodeError:
            continue
    # Store any new videos from the flat crawl
    if flat_entries:
        db = SessionLocal()
        try:
            channel = db.query(Channel).filter_by(id=channel_id).first()
            if channel:
                for entry in flat_entries:
                    if not db.query(Video).filter_by(youtube_video_id=entry["id"]).first():
                        try:
                            db.add(Video(
                                youtube_video_id=entry["id"],
                                channel_id=channel_id,
                                title=entry["title"],
                                thumbnail_url=ytdlp._stable_thumbnail(entry["id"]),
                                duration_seconds=entry["duration"],
                                tags="[]",
                            ))
                            db.commit()
                        except Exception:
                            db.rollback()
        finally:
            db.close()
    # Phase 2 — individual fetches for view_count, prioritising missing ones
    db = SessionLocal()
    try:
        # Videos without view_count first, then those with stale counts
        rows = db.execute(
            text("""
                SELECT youtube_video_id FROM videos