diff --git a/backend/routers/discovery.py b/backend/routers/discovery.py index 8e434bb..741653c 100644 --- a/backend/routers/discovery.py +++ b/backend/routers/discovery.py @@ -1,4 +1,5 @@ import json +import random from typing import Optional from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException @@ -81,6 +82,9 @@ def list_discovery( pass rows = [r for r in rows if neg_hit.get(r["channel_id"], 0) < 3] + # Add score perturbation so the list doesn't look identical every visit. + # ±8 jitter keeps relative ranking meaningful while surfacing different channels. + rows = sorted(rows, key=lambda r: r["score"] + random.uniform(-8, 8), reverse=True) rows = rows[:limit] items = [] for row in rows: @@ -210,7 +214,7 @@ def discovery_videos( ) ) WHERE rn <= 2 - ORDER BY score DESC, rn ASC, RANDOM() + ORDER BY (score + (RANDOM() * 10 - 5)) DESC, rn ASC LIMIT :limit OFFSET :offset """), {"user_id": current_user.id, "limit": limit, "offset": offset}, diff --git a/backend/services/discovery.py b/backend/services/discovery.py index bb78f91..172317f 100644 --- a/backend/services/discovery.py +++ b/backend/services/discovery.py @@ -77,14 +77,18 @@ def _upsert_channel(db: Session, channel_data: dict) -> Channel | None: return channel +_MAX_DISCOVERY_SCORE = 50.0 + + def _add_to_discovery( db: Session, user_id: int, channel_id: int, score: float, source: str, preview_json: str | None = None, ): + score = min(score, _MAX_DISCOVERY_SCORE) existing = db.query(DiscoveryQueue).filter_by(user_id=user_id, channel_id=channel_id).first() if existing: - # Accumulate scores across sources but cap to prevent one dominant signal - existing.score = existing.score + score * 0.5 + # Accumulate across sources but cap so no single signal dominates forever + existing.score = min(existing.score + score * 0.5, _MAX_DISCOVERY_SCORE) if preview_json and not existing.preview_json: existing.preview_json = preview_json return @@ -322,22 +326,26 @@ def update_category_clusters(db: Session, user_id: int): if not top_categories: return - placeholders = ",".join(f"'{c}'" for c in top_categories) - candidate_rows = db.execute( - text(f""" - SELECT DISTINCT v.channel_id - FROM videos v - WHERE v.category IN ({placeholders}) - AND v.channel_id NOT IN ( - SELECT channel_id FROM user_channels WHERE user_id = :user_id - ) - LIMIT 100 - """), - {"user_id": user_id}, - ).mappings().all() + # Use JSON_EACH / parameterized IN via repeated queries to avoid SQL injection + candidate_channel_ids: set[int] = set() + for cat in top_categories: + cat_rows = db.execute( + text(""" + SELECT DISTINCT v.channel_id + FROM videos v + WHERE v.category = :cat + AND v.channel_id NOT IN ( + SELECT channel_id FROM user_channels WHERE user_id = :user_id + ) + LIMIT 50 + """), + {"cat": cat, "user_id": user_id}, + ).mappings().all() + for row in cat_rows: + candidate_channel_ids.add(row["channel_id"]) - for row in candidate_rows: - _add_to_discovery(db, user_id, row["channel_id"], score=3.0, source="category") + for channel_id in candidate_channel_ids: + _add_to_discovery(db, user_id, channel_id, score=5.0, source="category") db.commit() @@ -548,7 +556,9 @@ def update_trending_signal(db: Session, user_id: int, regions: list[str]): if uc and uc.status in ("followed", "dismissed"): continue - base_score = float(info["count"]) * 4.0 * len(info["regions"]) + # Cap base_score so a viral trending channel can't dominate the whole queue. + # count × 4.0 × regions can reach 300+ without this cap. + base_score = min(float(info["count"]) * 4.0 * len(info["regions"]), 18.0) # Tag relevance: positive for liked content, negative for dismissed/disliked. # tag_profile comes from user_tag_affinity which tracks both signals. @@ -561,7 +571,7 @@ def update_trending_signal(db: Session, user_id: int, regions: list[str]): for tags_json in tag_rows: tag_boost += _tag_relevance_score(tag_profile, tags_json) - final_score = base_score + tag_boost + final_score = min(base_score + tag_boost, 25.0) if final_score <= 0: continue @@ -646,6 +656,19 @@ def update_graph_signal(db: Session, user_id: int): def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = None): if regions is None: regions = ["US", "SE"] + + # Expire unseen entries older than 14 days so stale high-score channels + # don't block fresh results forever. + db.execute( + text(""" + DELETE FROM discovery_queue + WHERE user_id = :user_id AND seen = 0 + AND created_at <= datetime('now', '-14 days') + """), + {"user_id": user_id}, + ) + db.commit() + crawl_by_search(db, user_id) update_community_signal(db, user_id) update_category_clusters(db, user_id)