Drastically reduce discovery yt-dlp call count: 64 → ~21

Each yt-dlp call is a separate subprocess that opens a new HTTP session with
YouTube. 64 sessions in a row looks like a bot regardless of rate limiting.

Changes:
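- crawl_by_search: 30 queries → 10 (top 5 tags, 4 channel names, 1 serendipity; up to 2 categories are appended last and only used when the other groups leave room under the 10-query cap)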
- update_liked_signal: 10 queries → 4
- update_watch_signal: no longer called from run_full_discovery (its tags are already included in crawl_by_search)
- update_trending_signal: 2 regions → 1 (first region only)
- update_graph_signal: 12 sampled channels → 6

New total: ~21 yt-dlp calls per run (~105s with 5s gaps) vs 64 calls (~320s) before.
Signal quality is preserved — the removed queries were low-marginal-value
duplicates of content already covered by the remaining ones.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 02:14:25 +02:00
parent 0a4dfb845e
commit e6faf8e08e

View File

@@ -255,21 +255,21 @@ def crawl_by_search(db: Session, user_id: int):
{"user_id": user_id}, {"user_id": user_id},
).mappings().all() ).mappings().all()
# Build query pool: top tags + random channel names + categories # Keep the query count low — each query is a separate yt-dlp subprocess
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:12]] # (its own HTTP session). Too many back-to-back sessions look like a bot.
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:5]]
top_cats = [r["category"] for r in cat_rows] top_cats = [r["category"] for r in cat_rows]
# Random sample of followed channel names — diversifies discovery each run # A few randomly-sampled channel names — diversifies results each run
sampled_names: list[str] = [] sampled_names: list[str] = []
if followed_names: if followed_names:
sampled_names = random.sample(followed_names, min(15, len(followed_names))) sampled_names = random.sample(followed_names, min(4, len(followed_names)))
# Serendipity queries: "best [category] channels" — surfaces curated list videos # One serendipity query to surface content outside the user's direct tag space
# which then get their channel indexed; broadens discovery beyond direct tag matches. serendipity = [f"best {top_cats[0]} channels"] if top_cats else []
serendipity = [f"best {cat} channels" for cat in top_cats[:3]]
# Combine: tags (most signal) + channel names (broad reach) + serendipity + categories # Total target: ≤10 queries
queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats))[:30] queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats[:2]))[:10]
if not queries: if not queries:
return return
@@ -393,7 +393,7 @@ def update_liked_signal(db: Session, user_id: int):
{"user_id": user_id}, {"user_id": user_id},
).scalars().all()) ).scalars().all())
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:10]] top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:4]]
neg_tags = frozenset( neg_tags = frozenset(
r["tag"] for r in db.execute( r["tag"] for r in db.execute(
text("SELECT tag FROM user_tag_affinity WHERE user_id = :user_id AND score < -2"), text("SELECT tag FROM user_tag_affinity WHERE user_id = :user_id AND score < -2"),
@@ -620,7 +620,7 @@ def update_graph_signal(db: Session, user_id: int):
{"user_id": user_id}, {"user_id": user_id},
).scalars().all()) ).scalars().all())
sample = random.sample(list(followed_rows), min(12, len(followed_rows))) sample = random.sample(list(followed_rows), min(6, len(followed_rows)))
featured_map: dict[str, list[str]] = {} featured_map: dict[str, list[str]] = {}
for row in sample: for row in sample:
@@ -673,10 +673,10 @@ def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = No
) )
db.commit() db.commit()
crawl_by_search(db, user_id) crawl_by_search(db, user_id) # ~10 yt-dlp calls
update_community_signal(db, user_id) update_community_signal(db, user_id) # no yt-dlp
update_category_clusters(db, user_id) update_category_clusters(db, user_id) # no yt-dlp
update_liked_signal(db, user_id) update_liked_signal(db, user_id) # ~4 yt-dlp calls
update_watch_signal(db, user_id) # update_watch_signal skipped — tags already included in crawl_by_search
update_trending_signal(db, user_id, regions) update_trending_signal(db, user_id, regions[:1]) # 1 yt-dlp call (first region only)
update_graph_signal(db, user_id) update_graph_signal(db, user_id) # ~6 yt-dlp calls