Add queue-based gradual discovery with shuffled call ordering and progress UI

Each yt-dlp call is now an independent task (one search query, one trending
fetch, one graph channel fetch). Tasks are shuffled together so we don't fire
10 searches in a row, then enqueued with 30-90s random gaps between them —
a full sweep of ~17 tasks completes in roughly 10-25 minutes instead of
hammering YouTube with 21 calls back-to-back.

Fast signals (community, category clusters) still run synchronously at
schedule time since they're pure SQL.

Progress is tracked per-user (total/done/running) and exposed on
GET /api/discovery/status. The Discovery page polls every 10s while
running and shows a progress bar + "Finding channels… X / Y" in the header.
The auto-discovery daemon skips scheduling if a manual sweep is already running.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 02:28:35 +02:00
parent e6faf8e08e
commit a535e9f22a
4 changed files with 367 additions and 44 deletions

View File

@@ -175,9 +175,11 @@ def on_startup():
finally:
db.close()
# Backfill descriptions for videos that don't have them yet (runs in background)
# Start discovery worker and backfill enrichment
import threading
from .routers.channels import _enrich_missing_task, _index_channels_batch
from .services.discovery import start_discovery_worker
start_discovery_worker()
threading.Thread(target=_enrich_missing_task, args=(50,), daemon=True).start()
def _auto_sync_daemon():
@@ -222,7 +224,7 @@ def on_startup():
import time as _time
from datetime import datetime as _dt, timedelta as _td
from sqlalchemy import text as _text
from .services.discovery import run_full_discovery
from .services.discovery import schedule_discovery, get_discovery_progress
# Wait 5 minutes after startup before the first check so the app can
# finish initialising and existing enrichment tasks can settle.
@@ -244,13 +246,12 @@ def on_startup():
last = row["last_discovery_run"]
if last is None or (_dt.utcnow() - _dt.fromisoformat(str(last))) > _td(hours=23):
uid = row["user_id"]
# Skip if a manual sweep is already running
prog = get_discovery_progress(uid)
if prog and prog.get("running"):
continue
regions = [r.strip().upper() for r in (row["discovery_regions"] or "US,SE").split(",") if r.strip()]
run_full_discovery(db, uid, regions)
db.execute(
_text("UPDATE user_settings SET last_discovery_run = :now WHERE user_id = :uid"),
{"now": _dt.utcnow(), "uid": uid},
)
db.commit()
schedule_discovery(uid, regions)
finally:
db.close()
except Exception:

View File

@@ -2,7 +2,7 @@ import json
import random
from typing import Optional
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy.orm import Session
from sqlalchemy import text
@@ -10,7 +10,7 @@ from sqlalchemy import text
from ..auth_utils import get_current_user
from ..database import get_db
from ..models import Channel, DiscoveryQueue, User, UserChannel, UserSettings
from ..services.discovery import run_full_discovery
from ..services.discovery import schedule_discovery, get_discovery_progress
router = APIRouter()
@@ -160,34 +160,14 @@ def dismiss_discovery(
@router.post("/refresh", status_code=202)
def refresh_discovery(
background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user),
):
s = db.query(UserSettings).filter_by(user_id=current_user.id).first()
regions_str = (s.discovery_regions if s and s.discovery_regions else "US,SE")
regions = [r.strip().upper() for r in regions_str.split(",") if r.strip()]
user_id = current_user.id
def _run_discovery():
from datetime import datetime
from ..database import SessionLocal
from sqlalchemy import text as _text
fresh_db = SessionLocal()
try:
run_full_discovery(fresh_db, user_id, regions)
fresh_db.execute(
_text("UPDATE user_settings SET last_discovery_run = :now WHERE user_id = :uid"),
{"now": datetime.utcnow(), "uid": user_id},
)
fresh_db.commit()
finally:
fresh_db.close()
background_tasks.add_task(_run_discovery)
from .channels import _enrich_missing_task
background_tasks.add_task(_enrich_missing_task, 20)
return {"detail": "Discovery refresh started"}
schedule_discovery(current_user.id, regions)
return {"detail": "Discovery queued"}
@router.get("/videos", response_model=list[dict])
@@ -263,7 +243,6 @@ def discovery_status(
db: Session = Depends(get_db),
current_user: User = Depends(get_current_user),
):
from ..models import UserSettings
s = db.query(UserSettings).filter_by(user_id=current_user.id).first()
pending = db.execute(
text("SELECT COUNT(*) AS n FROM discovery_queue WHERE user_id = :uid AND seen = 0"),
@@ -272,6 +251,7 @@ def discovery_status(
return {
"last_run": s.last_discovery_run.isoformat() if s and s.last_discovery_run else None,
"pending_count": pending["n"] if pending else 0,
"progress": get_discovery_progress(current_user.id),
}

View File

@@ -1,6 +1,9 @@
"""Discovery engine — search-based crawl, trending, community signal, category clustering."""
import json
import queue as _queue
import random
import threading as _threading
import time as _time
from datetime import datetime
from sqlalchemy.orm import Session
from sqlalchemy import text
@@ -8,6 +11,16 @@ from sqlalchemy import text
from ..models import Channel, UserChannel, DiscoveryQueue, Video
from . import ytdlp
# ---------------------------------------------------------------------------
# Background task queue — spaces yt-dlp calls 30-90 s apart and shuffles
# call types so we don't fire 10 searches in a row.
# ---------------------------------------------------------------------------
_task_queue: _queue.Queue = _queue.Queue()
_progress: dict[int, dict] = {} # user_id -> {total, done, running}
_progress_lock = _threading.Lock()
_worker_started = False
_worker_lock = _threading.Lock()
def _fetch_and_index_channel(db: Session, channel: Channel):
"""Fetch full metadata + recent videos for a discovered channel."""
@@ -680,3 +693,316 @@ def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = No
# update_watch_signal skipped — tags already included in crawl_by_search
update_trending_signal(db, user_id, regions[:1]) # 1 yt-dlp call (first region only)
update_graph_signal(db, user_id) # ~6 yt-dlp calls
# ---------------------------------------------------------------------------
# Queue-based gradual discovery — each yt-dlp call is its own task, shuffled
# so call types are mixed, with 30-90 s gaps between them.
# ---------------------------------------------------------------------------
def _get_followed_yt_ids(db: Session, user_id: int) -> set[str]:
return set(db.execute(
text("""
SELECT c.youtube_channel_id FROM channels c
JOIN user_channels uc ON c.id = uc.channel_id
WHERE uc.user_id = :uid AND uc.status = 'followed'
"""),
{"uid": user_id},
).scalars().all())
def _get_neg_tags(db: Session, user_id: int) -> frozenset[str]:
return frozenset(db.execute(
text("SELECT tag FROM user_tag_affinity WHERE user_id = :uid AND score < -2"),
{"uid": user_id},
).scalars().all())
def _stamp_last_run(user_id: int):
from ..database import SessionLocal
from sqlalchemy import text as _text
db = SessionLocal()
try:
db.execute(
_text("UPDATE user_settings SET last_discovery_run = :now WHERE user_id = :uid"),
{"now": datetime.utcnow(), "uid": user_id},
)
db.commit()
except Exception:
db.rollback()
finally:
db.close()
def _do_task_search(user_id: int, query: str, source: str, score_multiplier: float):
from ..database import SessionLocal
db = SessionLocal()
try:
followed_yt_ids = _get_followed_yt_ids(db, user_id)
neg_tags = _get_neg_tags(db, user_id)
_search_and_store(db, user_id, [query], followed_yt_ids, score_multiplier, source, neg_tags)
finally:
db.close()
def _do_task_trending(user_id: int, region: str):
from ..database import SessionLocal
db = SessionLocal()
try:
update_trending_signal(db, user_id, [region])
finally:
db.close()
def _fetch_graph_for_channel(db: Session, user_id: int, source_yt_id: str):
"""Fetch featured channels for one followed channel and add to discovery queue."""
followed_yt_ids = _get_followed_yt_ids(db, user_id)
dismissed_ids = set(db.execute(
text("SELECT channel_id FROM user_channels WHERE user_id = :uid AND status = 'dismissed'"),
{"uid": user_id},
).scalars().all())
try:
featured = ytdlp.fetch_featured_channels(source_yt_id)
except Exception:
return
needs_indexing: list[int] = []
for yt_id in featured:
if yt_id in followed_yt_ids:
continue
channel = db.query(Channel).filter_by(youtube_channel_id=yt_id).first()
is_new = channel is None
if not channel:
channel = Channel(youtube_channel_id=yt_id, name="", description="", thumbnail_url=None)
db.add(channel)
db.flush()
if channel.id in dismissed_ids:
continue
uc = db.query(UserChannel).filter_by(user_id=user_id, channel_id=channel.id).first()
if uc and uc.status in ("followed", "dismissed"):
continue
_add_to_discovery(db, user_id, channel.id, score=8.0, source="graph")
if is_new or not channel.crawled_at:
needs_indexing.append(channel.id)
db.commit()
for channel_id in needs_indexing[:3]:
ch = db.query(Channel).filter_by(id=channel_id).first()
if ch:
_fetch_and_index_channel(db, ch)
def _do_task_graph(user_id: int, source_yt_id: str):
from ..database import SessionLocal
db = SessionLocal()
try:
_fetch_graph_for_channel(db, user_id, source_yt_id)
finally:
db.close()
def _worker_loop():
while True:
try:
user_id, task = _task_queue.get(timeout=10)
except _queue.Empty:
continue
try:
task()
except Exception:
pass
with _progress_lock:
p = _progress.get(user_id)
if p:
p["done"] = min(p["done"] + 1, p["total"])
if p["done"] >= p["total"] and p["running"]:
p["running"] = False
_threading.Thread(target=_stamp_last_run, args=(user_id,), daemon=True).start()
_task_queue.task_done()
# Polite gap — only sleep if more tasks are waiting
if not _task_queue.empty():
_time.sleep(random.uniform(30, 90))
def start_discovery_worker():
"""Start the singleton background worker thread (idempotent)."""
global _worker_started
with _worker_lock:
if not _worker_started:
_threading.Thread(target=_worker_loop, daemon=True, name="discovery-worker").start()
_worker_started = True
def get_discovery_progress(user_id: int) -> dict | None:
with _progress_lock:
p = _progress.get(user_id)
return dict(p) if p is not None else None
def _build_search_task_args(db: Session, user_id: int) -> list[tuple[str, str, float]]:
"""Compute all search/liked query strings without executing any yt-dlp calls."""
result: list[tuple[str, str, float]] = []
followed_rows = db.execute(
text("""
SELECT c.name, c.youtube_channel_id
FROM channels c
JOIN user_channels uc ON c.id = uc.channel_id
WHERE uc.user_id = :user_id AND uc.status = 'followed'
"""),
{"user_id": user_id},
).mappings().all()
followed_names = [row["name"] for row in followed_rows if row["name"]]
tag_rows = db.execute(
text("""
SELECT tags FROM (
SELECT v.tags FROM videos v
JOIN user_channels uc ON v.channel_id = uc.channel_id
WHERE uc.user_id = :user_id AND uc.status = 'followed'
AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
LIMIT 300
)
UNION ALL
SELECT tags FROM (
SELECT v.tags FROM user_videos uv
JOIN videos v ON uv.video_id = v.id
WHERE uv.user_id = :user_id AND uv.liked = 1
AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
LIMIT 100
)
"""),
{"user_id": user_id},
).mappings().all()
tag_counts: dict[str, int] = {}
liked_tag_counts: dict[str, int] = {}
for row in tag_rows:
try:
for tag in json.loads(row["tags"]):
if isinstance(tag, str):
t = tag.lower().strip()
if 3 <= len(t) <= 40:
tag_counts[t] = tag_counts.get(t, 0) + 1
except (json.JSONDecodeError, TypeError):
continue
cat_rows = db.execute(
text("""
SELECT v.category, COUNT(*) AS cnt
FROM videos v
JOIN user_channels uc ON v.channel_id = uc.channel_id
WHERE uc.user_id = :user_id AND uc.status = 'followed'
AND v.category IS NOT NULL
GROUP BY v.category
ORDER BY cnt DESC
LIMIT 5
"""),
{"user_id": user_id},
).mappings().all()
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:5]]
top_cats = [r["category"] for r in cat_rows]
sampled_names = random.sample(followed_names, min(4, len(followed_names))) if followed_names else []
serendipity = [f"best {top_cats[0]} channels"] if top_cats else []
search_queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats[:2]))[:10]
for q in search_queries:
result.append((q, "search", 5.0))
# Liked signal queries
liked_rows = db.execute(
text("""
SELECT v.tags FROM user_videos uv
JOIN videos v ON uv.video_id = v.id
WHERE uv.user_id = :user_id AND uv.liked = 1
AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
"""),
{"user_id": user_id},
).mappings().all()
for row in liked_rows:
try:
for tag in json.loads(row["tags"]):
if isinstance(tag, str):
t = tag.lower().strip()
if 3 <= len(t) <= 40:
liked_tag_counts[t] = liked_tag_counts.get(t, 0) + 2
except (json.JSONDecodeError, TypeError):
pass
for q in [t for t, _ in sorted(liked_tag_counts.items(), key=lambda x: -x[1])[:4]]:
result.append((q, "liked", 10.0))
return result
def _sample_graph_yt_ids(db: Session, user_id: int) -> list[str]:
rows = db.execute(
text("""
SELECT c.youtube_channel_id
FROM channels c
JOIN user_channels uc ON c.id = uc.channel_id
WHERE uc.user_id = :user_id AND uc.status = 'followed'
AND c.youtube_channel_id IS NOT NULL
"""),
{"user_id": user_id},
).scalars().all()
if not rows:
return []
return random.sample(list(rows), min(6, len(rows)))
def schedule_discovery(user_id: int, regions: list[str] | None = None):
"""Schedule a full discovery sweep, spreading yt-dlp calls 30-90 s apart
with call types shuffled so searches, graph fetches, and trending are mixed."""
if regions is None:
regions = ["US", "SE"]
from ..database import SessionLocal
# Fast signals (pure SQL, no yt-dlp) run synchronously right now
db = SessionLocal()
try:
db.execute(
text("""
DELETE FROM discovery_queue
WHERE user_id = :uid AND seen = 0
AND created_at <= datetime('now', '-14 days')
"""),
{"uid": user_id},
)
db.commit()
update_community_signal(db, user_id)
update_category_clusters(db, user_id)
search_args = _build_search_task_args(db, user_id)
graph_yt_ids = _sample_graph_yt_ids(db, user_id)
finally:
db.close()
# Build one task per yt-dlp call, then shuffle to mix call types
tasks: list[tuple[int, object]] = []
for query, source, mult in search_args:
tasks.append((user_id, lambda q=query, s=source, m=mult: _do_task_search(user_id, q, s, m)))
for region in regions[:1]:
tasks.append((user_id, lambda r=region: _do_task_trending(user_id, r)))
for yt_id in graph_yt_ids:
tasks.append((user_id, lambda y=yt_id: _do_task_graph(user_id, y)))
random.shuffle(tasks)
with _progress_lock:
_progress[user_id] = {"total": len(tasks), "done": 0, "running": bool(tasks)}
for item in tasks:
_task_queue.put(item)
if not tasks:
_stamp_last_run(user_id)