From 146a044e6958b130ee50248ba11636f90e598c45 Mon Sep 17 00:00:00 2001
From: Mattias Thall <mattias@inputnoi.se>
Date: Wed, 27 May 2026 02:34:05 +0200
Subject: [PATCH] Fix For You feed: replace broken jitter with proper
 tier-based sampling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The old approach added ±12 noise to scores that span -365..+100 (recency
uses raw Julian days), so the perturbation could never reorder videos that
differed by more than 24 points — which is almost all of them. Every
reshuffle returned the same ranking.

Fixes:
- Per-channel candidate window: rn <= 15 (was rn <= 5) for a much wider pool
- Candidate pool: min(limit * 15, 600) rows per page (was limit * 4, e.g. 100 for limit = 25)
- Non-overlapping page offsets: page N (N = offset // limit) pulls SQL rows
  N*candidate_limit .. (N+1)*candidate_limit - 1, so pagination actually moves
  through new material instead of re-reading an overlapping top-of-list window
- Replaced ±12 perturbation with proper tier-based random sampling:
  top 40% → 60% of page, mid 40% → 30%, bottom 20% → 10% wildcards
  Each reshuffle picks a genuinely different mix from the score-ranked pool

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/routers/videos.py | 60 +++++++++++++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 12 deletions(-)

diff --git a/backend/routers/videos.py b/backend/routers/videos.py
index fbf2f6a..7f08145 100644
--- a/backend/routers/videos.py
+++ b/backend/routers/videos.py
@@ -298,7 +298,14 @@ def home_feed(
 
     # mode == "ranked" (default)
     import random as _random
-    candidate_limit = limit * 4  # wider pool for tier sampling
+
+    # Pull a large candidate pool per page. Each page draws from a NON-overlapping
+    # slice of the scored list so pagination actually moves through new material.
+    # candidate_limit >> limit so tier-sampling has real variety to choose from.
+    candidate_limit = min(limit * 15, 600)
+    page_num = offset // limit if limit > 0 else 0
+    sql_offset = page_num * candidate_limit  # non-overlapping pages
+
     rows = db.execute(
         text(f"""
             WITH channel_stats AS (
@@ -361,24 +368,53 @@ def home_feed(
                   {duration_clause}
             )
             SELECT * FROM scored
-            WHERE rn <= 5
+            WHERE rn <= 15
             ORDER BY score DESC
-            LIMIT :candidate_limit OFFSET :offset
+            LIMIT :candidate_limit OFFSET :sql_offset
         """),
-        {"user_id": current_user.id, "candidate_limit": candidate_limit, "offset": offset,
+        {"user_id": current_user.id, "candidate_limit": candidate_limit, "sql_offset": sql_offset,
          "hide_watched": 1 if hide_watched else 0,
          "w_recency": w_recency, "w_affinity": w_affinity, "w_channel": w_channel},
     ).mappings().all()
 
-    # Tier-based sampling with score perturbation so the feed varies each load
+    # Tier-based sampling: scores span -365..+100+ so ±N jitter is useless.
+    # Instead split the ranked pool into three tiers (40/40/20 %) and randomly sample from each,
+    # so every reshuffle genuinely picks a different mix of top/mid/wildcard videos.
     candidates = [dict(r) for r in rows]
-    for c in candidates:
-        c["_ps"] = c["score"] + _random.uniform(-12, 12)
-    candidates.sort(key=lambda x: x["_ps"], reverse=True)
-    top = candidates[:limit]
+    n = len(candidates)
 
-    # Track impressions for page 0 (first visit) — penalises videos shown but ignored
-    if offset == 0 and top:
+    if n <= limit:
+        _random.shuffle(candidates)
+        top = candidates
+    else:
+        split1 = max(n * 2 // 5, limit)      # top 40 %
+        split2 = max(n * 4 // 5, split1 + 1) # next 40 %
+        t1 = candidates[:split1]
+        t2 = candidates[split1:split2]
+        t3 = candidates[split2:]
+
+        # 60 % from t1, 30 % from t2, 10 % wildcards from t3
+        n1 = limit * 6 // 10
+        n2 = limit * 3 // 10
+        n3 = limit - n1 - n2
+
+        picked = (
+            _random.sample(t1, min(n1, len(t1)))
+            + (_random.sample(t2, min(n2, len(t2))) if t2 else [])
+            + (_random.sample(t3, min(n3, len(t3))) if t3 else [])
+        )
+        # Fill any shortfall when a tier was smaller than requested
+        if len(picked) < limit:
+            already = {id(x) for x in picked}
+            rest = [x for x in candidates if id(x) not in already]
+            if rest:
+                picked += _random.sample(rest, min(limit - len(picked), len(rest)))
+
+        _random.shuffle(picked)
+        top = picked[:limit]
+
+    # Track impressions for the first page only — penalises videos shown but not clicked on repeat visits
+    if page_num == 0 and top:
         for item in top:
             if not item["watched"]:
                 db.execute(text("""
@@ -390,7 +426,7 @@ def home_feed(
         db.commit()
 
     followed = [
-        VideoDetail(**{k: v for k, v in item.items() if k not in ("watched", "score", "rn", "_ps")},
+        VideoDetail(**{k: v for k, v in item.items() if k not in ("watched", "score", "rn")},
                     is_watched=bool(item["watched"]))
         for item in top
     ]