From e6faf8e08e0a36e96a4cd9f03d771e6210483582 Mon Sep 17 00:00:00 2001
From: Mattias Thall <mattias@inputnoi.se>
Date: Wed, 27 May 2026 02:14:25 +0200
Subject: [PATCH] =?UTF-8?q?Drastically=20reduce=20discovery=20yt-dlp=20cal?=
 =?UTF-8?q?l=20count:=2064=20=E2=86=92=20~21?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each yt-dlp call is a separate subprocess that opens a new HTTP session with
YouTube. 64 sessions in a row looks like a bot regardless of rate limiting.

Changes:
- crawl_by_search: 30 queries → 10 (top 5 tags, 4 channel names, 1 serendipity;
  up to 2 categories fill any slots left under the cap)
- update_liked_signal: 10 queries → 4
- update_watch_signal: no longer called from run_full_discovery (its tags are
  already covered by crawl_by_search); the function itself is left in place
- update_trending_signal: 2 regions → 1 (first region only)
- update_graph_signal: 12 sampled channels → 6

New total: ~21 yt-dlp calls per run (~105s with 5s gaps) vs ~320s before.
Signal quality is preserved — the removed queries were low-marginal-value
duplicates of content already covered by the remaining ones.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/services/discovery.py | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/backend/services/discovery.py b/backend/services/discovery.py
index ded0bdc..cc90a57 100644
--- a/backend/services/discovery.py
+++ b/backend/services/discovery.py
@@ -255,21 +255,21 @@ def crawl_by_search(db: Session, user_id: int):
         {"user_id": user_id},
     ).mappings().all()
 
-    # Build query pool: top tags + random channel names + categories
-    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:12]]
+    # Keep the query count low — each query is a separate yt-dlp subprocess
+    # (its own HTTP session). Too many back-to-back sessions look like a bot.
+    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:5]]
     top_cats = [r["category"] for r in cat_rows]
 
-    # Random sample of followed channel names — diversifies discovery each run
+    # A few randomly-sampled channel names — diversifies results each run
     sampled_names: list[str] = []
     if followed_names:
-        sampled_names = random.sample(followed_names, min(15, len(followed_names)))
+        sampled_names = random.sample(followed_names, min(4, len(followed_names)))
 
-    # Serendipity queries: "best [category] channels" — surfaces curated list videos
-    # which then get their channel indexed; broadens discovery beyond direct tag matches.
-    serendipity = [f"best {cat} channels" for cat in top_cats[:3]]
+    # One serendipity query to surface content outside the user's direct tag space
+    serendipity = [f"best {top_cats[0]} channels"] if top_cats else []
 
-    # Combine: tags (most signal) + channel names (broad reach) + serendipity + categories
-    queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats))[:30]
+    # Total target: ≤10 queries
+    queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats[:2]))[:10]
     if not queries:
         return
 
@@ -393,7 +393,7 @@ def update_liked_signal(db: Session, user_id: int):
         {"user_id": user_id},
     ).scalars().all())
 
-    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:10]]
+    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:4]]
     neg_tags = frozenset(
         r["tag"] for r in db.execute(
             text("SELECT tag FROM user_tag_affinity WHERE user_id = :user_id AND score < -2"),
@@ -620,7 +620,7 @@ def update_graph_signal(db: Session, user_id: int):
         {"user_id": user_id},
     ).scalars().all())
 
-    sample = random.sample(list(followed_rows), min(12, len(followed_rows)))
+    sample = random.sample(list(followed_rows), min(6, len(followed_rows)))
 
     featured_map: dict[str, list[str]] = {}
     for row in sample:
@@ -673,10 +673,10 @@ def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = No
     )
     db.commit()
 
-    crawl_by_search(db, user_id)
-    update_community_signal(db, user_id)
-    update_category_clusters(db, user_id)
-    update_liked_signal(db, user_id)
-    update_watch_signal(db, user_id)
-    update_trending_signal(db, user_id, regions)
-    update_graph_signal(db, user_id)
+    crawl_by_search(db, user_id)          # ~10 yt-dlp calls
+    update_community_signal(db, user_id)  # no yt-dlp
+    update_category_clusters(db, user_id) # no yt-dlp
+    update_liked_signal(db, user_id)      # ~4 yt-dlp calls
+    # update_watch_signal skipped — tags already included in crawl_by_search
+    update_trending_signal(db, user_id, regions[:1] if regions else regions)  # first region only; None/empty passed through unchanged (regions defaults to None, so a bare regions[:1] would raise TypeError)
+    update_graph_signal(db, user_id)      # ~6 yt-dlp calls