Fix discovery scoring: cap trending, prevent score inflation, add freshness
- Cap trending base_score at 18.0 (was unbounded — a viral channel could score 240+ vs search's 15, making everything else invisible) - Cap all discovery scores at 50.0 globally so no single signal dominates - Fix score accumulation: cap accumulated total at 50.0 (was unbounded across repeated runs, cementing high-score channels in top positions forever) - Expire unseen queue entries older than 14 days at start of each run - Add ±8 score perturbation to discovery list endpoint (was pure score DESC, identical every visit until dismissed) - Add score perturbation to discovery_videos ORDER BY too - Fix SQL injection in update_category_clusters (category strings were interpolated directly into query; now use parameterized queries per category) - Raise category signal score from 3.0 → 5.0 to compensate for trending cap Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
import json
|
import json
|
||||||
|
import random
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
||||||
@@ -81,6 +82,9 @@ def list_discovery(
|
|||||||
pass
|
pass
|
||||||
rows = [r for r in rows if neg_hit.get(r["channel_id"], 0) < 3]
|
rows = [r for r in rows if neg_hit.get(r["channel_id"], 0) < 3]
|
||||||
|
|
||||||
|
# Add score perturbation so the list doesn't look identical every visit.
|
||||||
|
# ±8 jitter keeps relative ranking meaningful while surfacing different channels.
|
||||||
|
rows = sorted(rows, key=lambda r: r["score"] + random.uniform(-8, 8), reverse=True)
|
||||||
rows = rows[:limit]
|
rows = rows[:limit]
|
||||||
items = []
|
items = []
|
||||||
for row in rows:
|
for row in rows:
|
||||||
@@ -210,7 +214,7 @@ def discovery_videos(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
WHERE rn <= 2
|
WHERE rn <= 2
|
||||||
ORDER BY score DESC, rn ASC, RANDOM()
|
ORDER BY (score + (RANDOM() * 10 - 5)) DESC, rn ASC
|
||||||
LIMIT :limit OFFSET :offset
|
LIMIT :limit OFFSET :offset
|
||||||
"""),
|
"""),
|
||||||
{"user_id": current_user.id, "limit": limit, "offset": offset},
|
{"user_id": current_user.id, "limit": limit, "offset": offset},
|
||||||
|
|||||||
@@ -77,14 +77,18 @@ def _upsert_channel(db: Session, channel_data: dict) -> Channel | None:
|
|||||||
return channel
|
return channel
|
||||||
|
|
||||||
|
|
||||||
|
_MAX_DISCOVERY_SCORE = 50.0
|
||||||
|
|
||||||
|
|
||||||
def _add_to_discovery(
|
def _add_to_discovery(
|
||||||
db: Session, user_id: int, channel_id: int, score: float, source: str,
|
db: Session, user_id: int, channel_id: int, score: float, source: str,
|
||||||
preview_json: str | None = None,
|
preview_json: str | None = None,
|
||||||
):
|
):
|
||||||
|
score = min(score, _MAX_DISCOVERY_SCORE)
|
||||||
existing = db.query(DiscoveryQueue).filter_by(user_id=user_id, channel_id=channel_id).first()
|
existing = db.query(DiscoveryQueue).filter_by(user_id=user_id, channel_id=channel_id).first()
|
||||||
if existing:
|
if existing:
|
||||||
# Accumulate scores across sources but cap to prevent one dominant signal
|
# Accumulate across sources but cap so no single signal dominates forever
|
||||||
existing.score = existing.score + score * 0.5
|
existing.score = min(existing.score + score * 0.5, _MAX_DISCOVERY_SCORE)
|
||||||
if preview_json and not existing.preview_json:
|
if preview_json and not existing.preview_json:
|
||||||
existing.preview_json = preview_json
|
existing.preview_json = preview_json
|
||||||
return
|
return
|
||||||
@@ -322,22 +326,26 @@ def update_category_clusters(db: Session, user_id: int):
|
|||||||
if not top_categories:
|
if not top_categories:
|
||||||
return
|
return
|
||||||
|
|
||||||
placeholders = ",".join(f"'{c}'" for c in top_categories)
|
# Use JSON_EACH / parameterized IN via repeated queries to avoid SQL injection
|
||||||
candidate_rows = db.execute(
|
candidate_channel_ids: set[int] = set()
|
||||||
text(f"""
|
for cat in top_categories:
|
||||||
SELECT DISTINCT v.channel_id
|
cat_rows = db.execute(
|
||||||
FROM videos v
|
text("""
|
||||||
WHERE v.category IN ({placeholders})
|
SELECT DISTINCT v.channel_id
|
||||||
AND v.channel_id NOT IN (
|
FROM videos v
|
||||||
SELECT channel_id FROM user_channels WHERE user_id = :user_id
|
WHERE v.category = :cat
|
||||||
)
|
AND v.channel_id NOT IN (
|
||||||
LIMIT 100
|
SELECT channel_id FROM user_channels WHERE user_id = :user_id
|
||||||
"""),
|
)
|
||||||
{"user_id": user_id},
|
LIMIT 50
|
||||||
).mappings().all()
|
"""),
|
||||||
|
{"cat": cat, "user_id": user_id},
|
||||||
|
).mappings().all()
|
||||||
|
for row in cat_rows:
|
||||||
|
candidate_channel_ids.add(row["channel_id"])
|
||||||
|
|
||||||
for row in candidate_rows:
|
for channel_id in candidate_channel_ids:
|
||||||
_add_to_discovery(db, user_id, row["channel_id"], score=3.0, source="category")
|
_add_to_discovery(db, user_id, channel_id, score=5.0, source="category")
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
|
|
||||||
@@ -548,7 +556,9 @@ def update_trending_signal(db: Session, user_id: int, regions: list[str]):
|
|||||||
if uc and uc.status in ("followed", "dismissed"):
|
if uc and uc.status in ("followed", "dismissed"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
base_score = float(info["count"]) * 4.0 * len(info["regions"])
|
# Cap base_score so a viral trending channel can't dominate the whole queue.
|
||||||
|
# count × 4.0 × regions can reach 300+ without this cap.
|
||||||
|
base_score = min(float(info["count"]) * 4.0 * len(info["regions"]), 18.0)
|
||||||
|
|
||||||
# Tag relevance: positive for liked content, negative for dismissed/disliked.
|
# Tag relevance: positive for liked content, negative for dismissed/disliked.
|
||||||
# tag_profile comes from user_tag_affinity which tracks both signals.
|
# tag_profile comes from user_tag_affinity which tracks both signals.
|
||||||
@@ -561,7 +571,7 @@ def update_trending_signal(db: Session, user_id: int, regions: list[str]):
|
|||||||
for tags_json in tag_rows:
|
for tags_json in tag_rows:
|
||||||
tag_boost += _tag_relevance_score(tag_profile, tags_json)
|
tag_boost += _tag_relevance_score(tag_profile, tags_json)
|
||||||
|
|
||||||
final_score = base_score + tag_boost
|
final_score = min(base_score + tag_boost, 25.0)
|
||||||
if final_score <= 0:
|
if final_score <= 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -646,6 +656,19 @@ def update_graph_signal(db: Session, user_id: int):
|
|||||||
def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = None):
|
def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = None):
|
||||||
if regions is None:
|
if regions is None:
|
||||||
regions = ["US", "SE"]
|
regions = ["US", "SE"]
|
||||||
|
|
||||||
|
# Expire unseen entries older than 14 days so stale high-score channels
|
||||||
|
# don't block fresh results forever.
|
||||||
|
db.execute(
|
||||||
|
text("""
|
||||||
|
DELETE FROM discovery_queue
|
||||||
|
WHERE user_id = :user_id AND seen = 0
|
||||||
|
AND created_at <= datetime('now', '-14 days')
|
||||||
|
"""),
|
||||||
|
{"user_id": user_id},
|
||||||
|
)
|
||||||
|
db.commit()
|
||||||
|
|
||||||
crawl_by_search(db, user_id)
|
crawl_by_search(db, user_id)
|
||||||
update_community_signal(db, user_id)
|
update_community_signal(db, user_id)
|
||||||
update_category_clusters(db, user_id)
|
update_category_clusters(db, user_id)
|
||||||
|
|||||||
Reference in New Issue
Block a user