Drastically reduce discovery yt-dlp call count: 64 → ~21
Each yt-dlp call is a separate subprocess that opens a new HTTP session with YouTube. 64 sessions in a row look like a bot regardless of rate limiting.

Changes:
- crawl_by_search: 30 queries → 10 (top 5 tags, 4 channel names, 1 serendipity)
- update_liked_signal: 10 queries → 4
- update_watch_signal: removed (tags already included in crawl_by_search)
- update_trending_signal: 2 regions → 1 (first region only)
- update_graph_signal: 12 sampled channels → 6

New total: ~21 yt-dlp calls per run (~105s with 5s gaps) vs. ~320s before.

Signal quality is preserved — the removed queries were low-marginal-value duplicates of content already covered by the remaining ones.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -255,21 +255,21 @@ def crawl_by_search(db: Session, user_id: int):
|
||||
{"user_id": user_id},
|
||||
).mappings().all()
|
||||
|
||||
# Build query pool: top tags + random channel names + categories
|
||||
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:12]]
|
||||
# Keep the query count low — each query is a separate yt-dlp subprocess
|
||||
# (its own HTTP session). Too many back-to-back sessions look like a bot.
|
||||
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:5]]
|
||||
top_cats = [r["category"] for r in cat_rows]
|
||||
|
||||
# Random sample of followed channel names — diversifies discovery each run
|
||||
# A few randomly-sampled channel names — diversifies results each run
|
||||
sampled_names: list[str] = []
|
||||
if followed_names:
|
||||
sampled_names = random.sample(followed_names, min(15, len(followed_names)))
|
||||
sampled_names = random.sample(followed_names, min(4, len(followed_names)))
|
||||
|
||||
# Serendipity queries: "best [category] channels" — surfaces curated list videos
|
||||
# which then get their channel indexed; broadens discovery beyond direct tag matches.
|
||||
serendipity = [f"best {cat} channels" for cat in top_cats[:3]]
|
||||
# One serendipity query to surface content outside the user's direct tag space
|
||||
serendipity = [f"best {top_cats[0]} channels"] if top_cats else []
|
||||
|
||||
# Combine: tags (most signal) + channel names (broad reach) + serendipity + categories
|
||||
queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats))[:30]
|
||||
# Total target: ≤10 queries
|
||||
queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats[:2]))[:10]
|
||||
if not queries:
|
||||
return
|
||||
|
||||
@@ -393,7 +393,7 @@ def update_liked_signal(db: Session, user_id: int):
|
||||
{"user_id": user_id},
|
||||
).scalars().all())
|
||||
|
||||
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:10]]
|
||||
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:4]]
|
||||
neg_tags = frozenset(
|
||||
r["tag"] for r in db.execute(
|
||||
text("SELECT tag FROM user_tag_affinity WHERE user_id = :user_id AND score < -2"),
|
||||
@@ -620,7 +620,7 @@ def update_graph_signal(db: Session, user_id: int):
|
||||
{"user_id": user_id},
|
||||
).scalars().all())
|
||||
|
||||
sample = random.sample(list(followed_rows), min(12, len(followed_rows)))
|
||||
sample = random.sample(list(followed_rows), min(6, len(followed_rows)))
|
||||
|
||||
featured_map: dict[str, list[str]] = {}
|
||||
for row in sample:
|
||||
@@ -673,10 +673,10 @@ def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = No
|
||||
)
|
||||
db.commit()
|
||||
|
||||
crawl_by_search(db, user_id)
|
||||
update_community_signal(db, user_id)
|
||||
update_category_clusters(db, user_id)
|
||||
update_liked_signal(db, user_id)
|
||||
update_watch_signal(db, user_id)
|
||||
update_trending_signal(db, user_id, regions)
|
||||
update_graph_signal(db, user_id)
|
||||
crawl_by_search(db, user_id) # ~10 yt-dlp calls
|
||||
update_community_signal(db, user_id) # no yt-dlp
|
||||
update_category_clusters(db, user_id) # no yt-dlp
|
||||
update_liked_signal(db, user_id) # ~4 yt-dlp calls
|
||||
# update_watch_signal skipped — tags already included in crawl_by_search
|
||||
update_trending_signal(db, user_id, regions[:1]) # 1 yt-dlp call (first region only)
|
||||
update_graph_signal(db, user_id) # ~6 yt-dlp calls
|
||||
|
||||
Reference in New Issue
Block a user