Fix For You feed: replace broken jitter with proper tier-based sampling

The old approach added ±12 noise to scores that span -365..+100 (recency uses raw Julian days), so the perturbation could never reorder videos that differed by more than 24 points — which is almost all of them. Every reshuffle returned the same ranking.

Fixes:
- Per-channel candidate window: rn <= 15 (was rn <= 5) for a much wider pool
- Candidate pool: min(limit * 15, 600) rows per page, i.e. up to 600 (was limit * 4 = 100)
- Non-overlapping page offsets: page N pulls SQL rows N*pool .. (N+1)*pool, where pool is the candidate pool size (at most 600), so pagination actually moves through new material instead of re-reading the same top-100
- Replaced ±12 perturbation with proper tier-based random sampling: top 40% → 60% of page, mid 40% → 30%, bottom 20% → 10% wildcards. Each reshuffle picks a genuinely different mix from the score-ranked pool.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 02:34:05 +02:00
parent a535e9f22a
commit 146a044e69
1 changed files with 48 additions and 12 deletions
--- a/backend/routers/videos.py
+++ b/backend/routers/videos.py
@@ -298,7 +298,14 @@ def home_feed(

    # mode == "ranked" (default)
    import random as _random
-    candidate_limit = limit * 4  # wider pool for tier sampling
+
+    # Pull a large candidate pool per page. Each page draws from a NON-overlapping
+    # slice of the scored list so pagination actually moves through new material.
+    # candidate_limit >> limit so tier-sampling has real variety to choose from.
+    candidate_limit = min(limit * 15, 600)
+    page_num = offset // limit if limit > 0 else 0
+    sql_offset = page_num * candidate_limit  # non-overlapping pages
+
    rows = db.execute(
        text(f"""
            WITH channel_stats AS (
@@ -361,24 +368,53 @@ def home_feed(
                  {duration_clause}
            )
            SELECT * FROM scored
-            WHERE rn <= 5
+            WHERE rn <= 15
            ORDER BY score DESC
-            LIMIT :candidate_limit OFFSET :offset
+            LIMIT :candidate_limit OFFSET :sql_offset
        """),
-        {"user_id": current_user.id, "candidate_limit": candidate_limit, "offset": offset,
+        {"user_id": current_user.id, "candidate_limit": candidate_limit, "sql_offset": sql_offset,
         "hide_watched": 1 if hide_watched else 0,
         "w_recency": w_recency, "w_affinity": w_affinity, "w_channel": w_channel},
    ).mappings().all()

-    # Tier-based sampling with score perturbation so the feed varies each load
+    # Tier-based sampling: scores span -365..+100+ so ±N jitter is useless.
+    # Instead split the ranked pool into three tiers (roughly the top 40 %, the
+    # next 40 % and the bottom 20 %) and randomly sample from each, so every
+    # reshuffle genuinely picks a different mix of top/mid/wildcard videos.
    candidates = [dict(r) for r in rows]
-    for c in candidates:
-        c["_ps"] = c["score"] + _random.uniform(-12, 12)
-    candidates.sort(key=lambda x: x["_ps"], reverse=True)
-    top = candidates[:limit]
+    n = len(candidates)

-    # Track impressions for page 0 (first visit) — penalises videos shown but ignored
-    if offset == 0 and top:
+    if n <= limit:
+        _random.shuffle(candidates)
+        top = candidates
+    else:
+        split1 = max(n * 2 // 5, limit)      # top 40 %
+        split2 = max(n * 4 // 5, split1 + 1) # next 40 %
+        t1 = candidates[:split1]
+        t2 = candidates[split1:split2]
+        t3 = candidates[split2:]
+
+        # 60 % from t1, 30 % from t2, 10 % wildcards from t3
+        n1 = limit * 6 // 10
+        n2 = limit * 3 // 10
+        n3 = limit - n1 - n2
+
+        picked = (
+            _random.sample(t1, min(n1, len(t1)))
+            + (_random.sample(t2, min(n2, len(t2))) if t2 else [])
+            + (_random.sample(t3, min(n3, len(t3))) if t3 else [])
+        )
+        # Fill any shortfall when a tier was smaller than requested
+        if len(picked) < limit:
+            already = {id(x) for x in picked}
+            rest = [x for x in candidates if id(x) not in already]
+            if rest:
+                picked += _random.sample(rest, min(limit - len(picked), len(rest)))
+
+        _random.shuffle(picked)
+        top = picked[:limit]
+
+    # Track impressions on the first page only — penalises videos shown but not clicked on repeat visits
+    if page_num == 0 and top:
        for item in top:
            if not item["watched"]:
                db.execute(text("""
@@ -390,7 +426,7 @@ def home_feed(
        db.commit()

    followed = [
-        VideoDetail(**{k: v for k, v in item.items() if k not in ("watched", "score", "rn", "_ps")},
+        VideoDetail(**{k: v for k, v in item.items() if k not in ("watched", "score", "rn")},
                    is_watched=bool(item["watched"]))
        for item in top
    ]