diff --git a/backend/services/discovery.py b/backend/services/discovery.py
index ded0bdc..cc90a57 100644
--- a/backend/services/discovery.py
+++ b/backend/services/discovery.py
@@ -255,21 +255,21 @@ def crawl_by_search(db: Session, user_id: int):
         {"user_id": user_id},
     ).mappings().all()
 
-    # Build query pool: top tags + random channel names + categories
-    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:12]]
+    # Keep the query count low — each query is a separate yt-dlp subprocess
+    # (its own HTTP session). Too many back-to-back sessions look like a bot.
+    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:5]]
     top_cats = [r["category"] for r in cat_rows]
 
-    # Random sample of followed channel names — diversifies discovery each run
+    # A few randomly-sampled channel names — diversifies results each run
     sampled_names: list[str] = []
     if followed_names:
-        sampled_names = random.sample(followed_names, min(15, len(followed_names)))
+        sampled_names = random.sample(followed_names, min(4, len(followed_names)))
 
-    # Serendipity queries: "best [category] channels" — surfaces curated list videos
-    # which then get their channel indexed; broadens discovery beyond direct tag matches.
-    serendipity = [f"best {cat} channels" for cat in top_cats[:3]]
+    # One serendipity query to surface content outside the user's direct tag space
+    serendipity = [f"best {top_cats[0]} channels"] if top_cats else []
 
-    # Combine: tags (most signal) + channel names (broad reach) + serendipity + categories
-    queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats))[:30]
+    # Total target: ≤10 queries
+    queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats[:2]))[:10]
 
     if not queries:
         return
@@ -393,7 +393,7 @@ def update_liked_signal(db: Session, user_id: int):
         {"user_id": user_id},
     ).scalars().all())
 
-    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:10]]
+    top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:4]]
     neg_tags = frozenset(
         r["tag"] for r in db.execute(
             text("SELECT tag FROM user_tag_affinity WHERE user_id = :user_id AND score < -2"),
@@ -620,7 +620,7 @@ def update_graph_signal(db: Session, user_id: int):
         {"user_id": user_id},
     ).scalars().all())
 
-    sample = random.sample(list(followed_rows), min(12, len(followed_rows)))
+    sample = random.sample(list(followed_rows), min(6, len(followed_rows)))
 
     featured_map: dict[str, list[str]] = {}
     for row in sample:
@@ -673,10 +673,13 @@ def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = No
     )
     db.commit()
 
-    crawl_by_search(db, user_id)
-    update_community_signal(db, user_id)
-    update_category_clusters(db, user_id)
-    update_liked_signal(db, user_id)
-    update_watch_signal(db, user_id)
-    update_trending_signal(db, user_id, regions)
-    update_graph_signal(db, user_id)
+    crawl_by_search(db, user_id)            # ~10 yt-dlp calls
+    update_community_signal(db, user_id)    # no yt-dlp
+    update_category_clusters(db, user_id)   # no yt-dlp
+    update_liked_signal(db, user_id)        # ~4 yt-dlp calls
+    # update_watch_signal skipped — tags already included in crawl_by_search
+    # `regions` defaults to None, and None[:1] raises TypeError, so only
+    # slice when a region list was actually supplied.
+    first_region = regions[:1] if regions else regions
+    update_trending_signal(db, user_id, first_region)  # 1 yt-dlp call (first region only)
+    update_graph_signal(db, user_id)        # ~6 yt-dlp calls