Fix hidden yt-dlp calls on video page causing cookie invalidation

Two background yt-dlp processes were firing every time a video page loaded:

1. importChapters (called unconditionally via useEffect on mount) was calling _upsert_video_from_yt with polite=False when chapters=NULL — no rate limiter, no download-pause check, and it ran concurrently with active downloads. Fix: return [] immediately when chapters=NULL and let the normal enrichment pipeline (already polite=True) fill them in.

2. get_video_by_yt_id schedules a background _enrich task whenever description or chapters are NULL. The frontend polls every 3 s while description is null, so dozens of enrichment tasks would pile up for the same video. Fix: deduplicate with the _enriching set — only one background fetch per youtube_video_id at a time; the set entry is cleared when the task finishes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 02:43:45 +02:00
parent 146a044e69
commit 1179b53f2e
1 changed files with 29 additions and 19 deletions
--- a/backend/routers/videos.py
+++ b/backend/routers/videos.py
@@ -1,5 +1,6 @@
 import os
 import random
+import threading
 from datetime import datetime
 from typing import Optional

@@ -17,6 +18,11 @@ from ..services.scoring import get_surprise_videos, get_discovery_injection

 router = APIRouter()

+# Tracks which video IDs currently have a background enrichment running,
+# so repeated polls from the frontend don't spawn duplicate yt-dlp calls.
+_enriching: set[str] = set()
+_enriching_lock = threading.Lock()
+

 def _update_affinity(db: Session, user_id: int, video: Video, delta: float):
    """Adjust tag/category affinity scores for a video. delta > 0 = positive signal."""
@@ -700,18 +706,12 @@ def import_chapters(
    import json as _json

    video = db.query(Video).filter(Video.id == video_id).first()
-    if not video:
+    if not video or video.chapters is None:
+        # chapters=NULL means enrichment hasn't run yet; the background fetch
+        # triggered by get_video_by_yt_id will fill this in. Don't call yt-dlp
+        # here — it runs polite=False and races with active downloads.
        return []

-    # chapters=NULL means never fetched; fetch now and cache the result (even if empty)
-    if video.chapters is None:
-        _upsert_video_from_yt(db, video.youtube_video_id)
-        db.refresh(video)
-        # Mark as checked even if no chapters found, so we don't re-fetch next time
-        if video.chapters is None:
-            video.chapters = "[]"
-            db.commit()
-
    chapters = _json.loads(video.chapters or "[]")
    # Skip if trivial (single chapter) or already imported
    if len(chapters) < 2:
@@ -935,7 +935,15 @@ def get_video_by_yt_id(
        # Video unknown — must block to get at least a title before we can render anything
        _upsert_video_from_yt(db, youtube_video_id)
    elif existing.description is None or existing.chapters is None:
-        # Video known but missing enrichment — fetch in background, return immediately
+        # Video known but missing enrichment — schedule one background fetch.
+        # The frontend polls every 3 s while description is null; without the
+        # dedup guard each poll would spawn its own yt-dlp process.
+        with _enriching_lock:
+            already = youtube_video_id in _enriching
+            if not already:
+                _enriching.add(youtube_video_id)
+
+        if not already:
            from ..database import SessionLocal
            def _enrich(yt_id: str):
                bg_db = SessionLocal()
@@ -943,6 +951,8 @@ def get_video_by_yt_id(
                    _upsert_video_from_yt(bg_db, yt_id, polite=True)
                finally:
                    bg_db.close()
+                    with _enriching_lock:
+                        _enriching.discard(yt_id)
            background_tasks.add_task(_enrich, youtube_video_id)

    row = db.execute(