Fix For You feed: replace broken jitter with proper tier-based sampling

The old approach added ±12 noise to scores that span -365..+100 (recency
uses raw Julian days), so the perturbation could never reorder videos that
differed by more than 24 points — which is almost all of them. Every
reshuffle returned the same ranking.

Fixes:
- Per-channel candidate window: rn <= 15 (was rn <= 5) for a much wider pool
- Candidate pool: min(limit * 15, 600) rows per page, i.e. up to 600 (was limit * 4, which is 100 for a 25-item page)
- Non-overlapping page offsets: page N pulls SQL rows N*candidate_limit up to (but not
  including) (N+1)*candidate_limit — e.g. N*600 .. (N+1)*600 - 1 at the 600-row cap — so
  pagination actually moves through new material instead of re-reading the same top-100
- Replaced ±12 perturbation with proper tier-based random sampling:
  top 40% → 60% of page, mid 40% → 30%, bottom 20% → 10% wildcards
  Each reshuffle picks a genuinely different mix from the score-ranked pool

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 02:34:05 +02:00
parent a535e9f22a
commit 146a044e69

View File

@@ -298,7 +298,14 @@ def home_feed(
# mode == "ranked" (default)
import random as _random
candidate_limit = limit * 4 # wider pool for tier sampling
# Pull a large candidate pool per page. Each page draws from a NON-overlapping
# slice of the scored list so pagination actually moves through new material.
# candidate_limit >> limit so tier-sampling has real variety to choose from.
candidate_limit = min(limit * 15, 600)
page_num = offset // limit if limit > 0 else 0
sql_offset = page_num * candidate_limit # non-overlapping pages
rows = db.execute(
text(f"""
WITH channel_stats AS (
@@ -361,24 +368,53 @@ def home_feed(
{duration_clause}
)
SELECT * FROM scored
WHERE rn <= 5
WHERE rn <= 15
ORDER BY score DESC
LIMIT :candidate_limit OFFSET :offset
LIMIT :candidate_limit OFFSET :sql_offset
"""),
{"user_id": current_user.id, "candidate_limit": candidate_limit, "offset": offset,
{"user_id": current_user.id, "candidate_limit": candidate_limit, "sql_offset": sql_offset,
"hide_watched": 1 if hide_watched else 0,
"w_recency": w_recency, "w_affinity": w_affinity, "w_channel": w_channel},
).mappings().all()
# Tier-based sampling with score perturbation so the feed varies each load
# Tier-based sampling: scores span -365..+100+ so ±N jitter is useless.
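# Instead split the ranked pool into three tiers (roughly the top 40 %, the next
# 40 % and the bottom 20 %; the top tier always holds at least `limit` items) and
# randomly sample from each, so every reshuffle genuinely picks a different mix
# of top/mid/wildcard videos.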
candidates = [dict(r) for r in rows]
for c in candidates:
c["_ps"] = c["score"] + _random.uniform(-12, 12)
candidates.sort(key=lambda x: x["_ps"], reverse=True)
top = candidates[:limit]
n = len(candidates)
# Track impressions for page 0 (first visit) — penalises videos shown but ignored
if offset == 0 and top:
if n <= limit:
_random.shuffle(candidates)
top = candidates
else:
split1 = max(n * 2 // 5, limit) # top 40 %
split2 = max(n * 4 // 5, split1 + 1) # next 40 %
t1 = candidates[:split1]
t2 = candidates[split1:split2]
t3 = candidates[split2:]
# 60 % from t1, 30 % from t2, 10 % wildcards from t3
n1 = limit * 6 // 10
n2 = limit * 3 // 10
n3 = limit - n1 - n2
picked = (
_random.sample(t1, min(n1, len(t1)))
+ (_random.sample(t2, min(n2, len(t2))) if t2 else [])
+ (_random.sample(t3, min(n3, len(t3))) if t3 else [])
)
# Fill any shortfall when a tier was smaller than requested
if len(picked) < limit:
already = {id(x) for x in picked}
rest = [x for x in candidates if id(x) not in already]
if rest:
picked += _random.sample(rest, min(limit - len(picked), len(rest)))
_random.shuffle(picked)
top = picked[:limit]
# Track impressions — penalises videos shown but not clicked on repeat visits
if page_num == 0 and top:
for item in top:
if not item["watched"]:
db.execute(text("""
@@ -390,7 +426,7 @@ def home_feed(
db.commit()
followed = [
VideoDetail(**{k: v for k, v in item.items() if k not in ("watched", "score", "rn", "_ps")},
VideoDetail(**{k: v for k, v in item.items() if k not in ("watched", "score", "rn")},
is_watched=bool(item["watched"]))
for item in top
]