From e6faf8e08e0a36e96a4cd9f03d771e6210483582 Mon Sep 17 00:00:00 2001 From: Mattias Thall Date: Wed, 27 May 2026 02:14:25 +0200 Subject: [PATCH] =?UTF-8?q?Drastically=20reduce=20discovery=20yt-dlp=20cal?= =?UTF-8?q?l=20count:=2064=20=E2=86=92=20~21?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each yt-dlp call is a separate subprocess that opens a new HTTP session with YouTube. 64 sessions in a row looks like a bot regardless of rate limiting. Changes: - crawl_by_search: 30 queries → 10 (top 5 tags, 4 channel names, 1 serendipity) - update_liked_signal: 10 queries → 4 - update_watch_signal: removed (tags already included in crawl_by_search) - update_trending_signal: 2 regions → 1 (first region only) - update_graph_signal: 12 sampled channels → 6 New total: ~21 yt-dlp calls per run (~105s with 5s gaps) vs ~320s before. Signal quality is preserved — the removed queries were low-marginal-value duplicates of content already covered by the remaining ones. Co-Authored-By: Claude Sonnet 4.6 --- backend/services/discovery.py | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/backend/services/discovery.py b/backend/services/discovery.py index ded0bdc..cc90a57 100644 --- a/backend/services/discovery.py +++ b/backend/services/discovery.py @@ -255,21 +255,21 @@ def crawl_by_search(db: Session, user_id: int): {"user_id": user_id}, ).mappings().all() - # Build query pool: top tags + random channel names + categories - top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:12]] + # Keep the query count low — each query is a separate yt-dlp subprocess + # (its own HTTP session). Too many back-to-back sessions look like a bot. 
+ top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:5]] top_cats = [r["category"] for r in cat_rows] - # Random sample of followed channel names — diversifies discovery each run + # A few randomly-sampled channel names — diversifies results each run sampled_names: list[str] = [] if followed_names: - sampled_names = random.sample(followed_names, min(15, len(followed_names))) + sampled_names = random.sample(followed_names, min(4, len(followed_names))) - # Serendipity queries: "best [category] channels" — surfaces curated list videos - # which then get their channel indexed; broadens discovery beyond direct tag matches. - serendipity = [f"best {cat} channels" for cat in top_cats[:3]] + # One serendipity query to surface content outside the user's direct tag space + serendipity = [f"best {top_cats[0]} channels"] if top_cats else [] - # Combine: tags (most signal) + channel names (broad reach) + serendipity + categories - queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats))[:30] + # Total target: ≤10 queries + queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats[:2]))[:10] if not queries: return @@ -393,7 +393,7 @@ def update_liked_signal(db: Session, user_id: int): {"user_id": user_id}, ).scalars().all()) - top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:10]] + top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:4]] neg_tags = frozenset( r["tag"] for r in db.execute( text("SELECT tag FROM user_tag_affinity WHERE user_id = :user_id AND score < -2"), @@ -620,7 +620,7 @@ def update_graph_signal(db: Session, user_id: int): {"user_id": user_id}, ).scalars().all()) - sample = random.sample(list(followed_rows), min(12, len(followed_rows))) + sample = random.sample(list(followed_rows), min(6, len(followed_rows))) featured_map: dict[str, list[str]] = {} for row in sample: @@ -673,10 +673,10 @@ def run_full_discovery(db: Session, user_id: int, regions: 
list[str] | None = No ) db.commit() - crawl_by_search(db, user_id) - update_community_signal(db, user_id) - update_category_clusters(db, user_id) - update_liked_signal(db, user_id) - update_watch_signal(db, user_id) - update_trending_signal(db, user_id, regions) - update_graph_signal(db, user_id) + crawl_by_search(db, user_id) # ~10 yt-dlp calls + update_community_signal(db, user_id) # no yt-dlp + update_category_clusters(db, user_id) # no yt-dlp + update_liked_signal(db, user_id) # ~4 yt-dlp calls + # update_watch_signal skipped — tags already included in crawl_by_search + update_trending_signal(db, user_id, regions[:1] if regions else regions) # 1 yt-dlp call (first region only; None/empty passed through unchanged) + update_graph_signal(db, user_id) # ~6 yt-dlp calls