Drastically reduce discovery yt-dlp call count: 64 → ~21

Each yt-dlp call is a separate subprocess that opens a new HTTP session with YouTube. 64 sessions in a row look like a bot regardless of rate limiting.

Changes:
- crawl_by_search: 30 queries → 10 (top 5 tags, 4 channel names, 1 serendipity)
- update_liked_signal: 10 queries → 4
- update_watch_signal: removed (tags already included in crawl_by_search)
- update_trending_signal: 2 regions → 1 (first region only)
- update_graph_signal: 12 sampled channels → 6

New total: ~21 yt-dlp calls per run (~105s with 5s gaps) vs ~320s before.

Signal quality is preserved — the removed queries were low-marginal-value duplicates of content already covered by the remaining ones.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 02:14:25 +02:00
parent 0a4dfb845e
commit e6faf8e08e
1 changed file with 18 additions and 18 deletions
--- a/backend/services/discovery.py
+++ b/backend/services/discovery.py
@@ -255,21 +255,21 @@ def crawl_by_search(db: Session, user_id: int):
        {"user_id": user_id},
    ).mappings().all()
-    # Build query pool: top tags + random channel names + categories
+    # Keep the query count low — each query is a separate yt-dlp subprocess
-    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:12]]
+    # (its own HTTP session). Too many back-to-back sessions look like a bot.
+    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:5]]
    top_cats = [r["category"] for r in cat_rows]
-    # Random sample of followed channel names — diversifies discovery each run
+    # A few randomly-sampled channel names — diversifies results each run
    sampled_names: list[str] = []
    if followed_names:
-        sampled_names = random.sample(followed_names, min(15, len(followed_names)))
+        sampled_names = random.sample(followed_names, min(4, len(followed_names)))
-    # Serendipity queries: "best [category] channels" — surfaces curated list videos
+    # One serendipity query to surface content outside the user's direct tag space
-    # which then get their channel indexed; broadens discovery beyond direct tag matches.
+    serendipity = [f"best {top_cats[0]} channels"] if top_cats else []
-    serendipity = [f"best {cat} channels" for cat in top_cats[:3]]
-    # Combine: tags (most signal) + channel names (broad reach) + serendipity + categories
+    # Total target: ≤10 queries
-    queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats))[:30]
+    queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats[:2]))[:10]
    if not queries:
        return
@@ -393,7 +393,7 @@ def update_liked_signal(db: Session, user_id: int):
        {"user_id": user_id},
    ).scalars().all())
-    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:10]]
+    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:4]]
    neg_tags = frozenset(
        r["tag"] for r in db.execute(
            text("SELECT tag FROM user_tag_affinity WHERE user_id = :user_id AND score < -2"),
@@ -620,7 +620,7 @@ def update_graph_signal(db: Session, user_id: int):
        {"user_id": user_id},
    ).scalars().all())
-    sample = random.sample(list(followed_rows), min(12, len(followed_rows)))
+    sample = random.sample(list(followed_rows), min(6, len(followed_rows)))
    featured_map: dict[str, list[str]] = {}
    for row in sample:
@@ -673,10 +673,10 @@ def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = No
    )
    db.commit()
-    crawl_by_search(db, user_id)
+    crawl_by_search(db, user_id)          # ~10 yt-dlp calls
-    update_community_signal(db, user_id)
+    update_community_signal(db, user_id)  # no yt-dlp
-    update_category_clusters(db, user_id)
+    update_category_clusters(db, user_id) # no yt-dlp
-    update_liked_signal(db, user_id)
+    update_liked_signal(db, user_id)      # ~4 yt-dlp calls
-    update_watch_signal(db, user_id)
+    # update_watch_signal skipped — tags already included in crawl_by_search
-    update_trending_signal(db, user_id, regions)
+    update_trending_signal(db, user_id, regions[:1])  # 1 yt-dlp call (first region only)
-    update_graph_signal(db, user_id)
+    update_graph_signal(db, user_id)      # ~6 yt-dlp calls