From 146a044e6958b130ee50248ba11636f90e598c45 Mon Sep 17 00:00:00 2001 From: Mattias Thall Date: Wed, 27 May 2026 02:34:05 +0200 Subject: [PATCH] Fix For You feed: replace broken jitter with proper tier-based sampling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old approach added ±12 noise to scores that span -365..+100 (recency uses raw Julian days), so the perturbation could never reorder videos that differed by more than 24 points — which is almost all of them. Every reshuffle returned the same ranking. Fixes: - Per-channel candidate window: rn <= 15 (was rn <= 5) for a much wider pool - Candidate pool: up to 600 per page (was limit * 4 = 100) - Non-overlapping page offsets: page N pulls SQL rows N*600 .. (N+1)*600 so pagination actually moves through new material instead of re-reading the same top-100 - Replaced ±12 perturbation with proper tier-based random sampling: top 40% → 60% of page, mid 40% → 30%, bottom 20% → 10% wildcards Each reshuffle picks a genuinely different mix from the score-ranked pool Co-Authored-By: Claude Sonnet 4.6 --- backend/routers/videos.py | 60 +++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/backend/routers/videos.py b/backend/routers/videos.py index fbf2f6a..7f08145 100644 --- a/backend/routers/videos.py +++ b/backend/routers/videos.py @@ -298,7 +298,14 @@ def home_feed( # mode == "ranked" (default) import random as _random - candidate_limit = limit * 4 # wider pool for tier sampling + + # Pull a large candidate pool per page. Each page draws from a NON-overlapping + # slice of the scored list so pagination actually moves through new material. + # candidate_limit >> limit so tier-sampling has real variety to choose from. 
+ candidate_limit = min(limit * 15, 600) + page_num = offset // limit if limit > 0 else 0 + sql_offset = page_num * candidate_limit # non-overlapping pages + rows = db.execute( text(f""" WITH channel_stats AS ( @@ -361,24 +368,53 @@ def home_feed( {duration_clause} ) SELECT * FROM scored - WHERE rn <= 5 + WHERE rn <= 15 ORDER BY score DESC - LIMIT :candidate_limit OFFSET :offset + LIMIT :candidate_limit OFFSET :sql_offset """), - {"user_id": current_user.id, "candidate_limit": candidate_limit, "offset": offset, + {"user_id": current_user.id, "candidate_limit": candidate_limit, "sql_offset": sql_offset, "hide_watched": 1 if hide_watched else 0, "w_recency": w_recency, "w_affinity": w_affinity, "w_channel": w_channel}, ).mappings().all() - # Tier-based sampling with score perturbation so the feed varies each load + # Tier-based sampling: scores span -365..+100+ so ±N jitter is useless. + # Instead split the ranked pool into three tiers (roughly top 40 % / next 40 % / bottom 20 %) and randomly sample from each, + # so every reshuffle genuinely picks a different mix of top/mid/wildcard videos. 
candidates = [dict(r) for r in rows] - for c in candidates: - c["_ps"] = c["score"] + _random.uniform(-12, 12) - candidates.sort(key=lambda x: x["_ps"], reverse=True) - top = candidates[:limit] + n = len(candidates) - # Track impressions for page 0 (first visit) — penalises videos shown but ignored - if offset == 0 and top: + if n <= limit: + _random.shuffle(candidates) + top = candidates + else: + split1 = max(n * 2 // 5, limit) # top 40 % + split2 = max(n * 4 // 5, split1 + 1) # next 40 % + t1 = candidates[:split1] + t2 = candidates[split1:split2] + t3 = candidates[split2:] + + # 60 % from t1, 30 % from t2, 10 % wildcards from t3 + n1 = limit * 6 // 10 + n2 = limit * 3 // 10 + n3 = limit - n1 - n2 + + picked = ( + _random.sample(t1, min(n1, len(t1))) + + (_random.sample(t2, min(n2, len(t2))) if t2 else []) + + (_random.sample(t3, min(n3, len(t3))) if t3 else []) + ) + # Fill any shortfall when a tier was smaller than requested + if len(picked) < limit: + already = {id(x) for x in picked} + rest = [x for x in candidates if id(x) not in already] + if rest: + picked += _random.sample(rest, min(limit - len(picked), len(rest))) + + _random.shuffle(picked) + top = picked[:limit] + + # Track impressions — penalises videos shown but not clicked on repeat visits + if page_num == 0 and top: for item in top: if not item["watched"]: db.execute(text(""" @@ -390,7 +426,7 @@ def home_feed( db.commit() followed = [ - VideoDetail(**{k: v for k, v in item.items() if k not in ("watched", "score", "rn", "_ps")}, + VideoDetail(**{k: v for k, v in item.items() if k not in ("watched", "score", "rn")}, is_watched=bool(item["watched"])) for item in top ]