Add queue-based gradual discovery with shuffled call ordering and progress UI

Each yt-dlp call is now an independent task (one search query, one trending
fetch, one graph channel fetch). Tasks are shuffled together so we don't fire
10 searches in a row, then enqueued with 30-90s random gaps between them —
a full sweep of ~17 tasks completes in roughly 10-25 minutes instead of
hammering YouTube with 21 calls back-to-back.

Fast signals (community, category clusters) still run synchronously at
schedule time since they're pure SQL.

Progress is tracked per-user (total/done/running) and exposed on
GET /api/discovery/status. The Discovery page polls every 10s while
running and shows a progress bar + "Finding channels… X / Y" in the header.
The auto-discovery daemon skips scheduling if a manual sweep is already running.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 02:28:35 +02:00
parent e6faf8e08e
commit a535e9f22a
4 changed files with 367 additions and 44 deletions

View File

@@ -1,6 +1,9 @@
"""Discovery engine — search-based crawl, trending, community signal, category clustering."""
import json
import queue as _queue
import random
import threading as _threading
import time as _time
from datetime import datetime
from sqlalchemy.orm import Session
from sqlalchemy import text
@@ -8,6 +11,16 @@ from sqlalchemy import text
from ..models import Channel, UserChannel, DiscoveryQueue, Video
from . import ytdlp
# ---------------------------------------------------------------------------
# Background task queue — spaces yt-dlp calls 30-90 s apart and shuffles
# call types so we don't fire 10 searches in a row.
# ---------------------------------------------------------------------------
_task_queue: _queue.Queue = _queue.Queue()
_progress: dict[int, dict] = {} # user_id -> {total, done, running}
_progress_lock = _threading.Lock()
_worker_started = False
_worker_lock = _threading.Lock()
def _fetch_and_index_channel(db: Session, channel: Channel):
"""Fetch full metadata + recent videos for a discovered channel."""
@@ -680,3 +693,316 @@ def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = No
# update_watch_signal skipped — tags already included in crawl_by_search
update_trending_signal(db, user_id, regions[:1]) # 1 yt-dlp call (first region only)
update_graph_signal(db, user_id) # ~6 yt-dlp calls
# ---------------------------------------------------------------------------
# Queue-based gradual discovery — each yt-dlp call is its own task, shuffled
# so call types are mixed, with 30-90 s gaps between them.
# ---------------------------------------------------------------------------
def _get_followed_yt_ids(db: Session, user_id: int) -> set[str]:
return set(db.execute(
text("""
SELECT c.youtube_channel_id FROM channels c
JOIN user_channels uc ON c.id = uc.channel_id
WHERE uc.user_id = :uid AND uc.status = 'followed'
"""),
{"uid": user_id},
).scalars().all())
def _get_neg_tags(db: Session, user_id: int) -> frozenset[str]:
return frozenset(db.execute(
text("SELECT tag FROM user_tag_affinity WHERE user_id = :uid AND score < -2"),
{"uid": user_id},
).scalars().all())
def _stamp_last_run(user_id: int):
from ..database import SessionLocal
from sqlalchemy import text as _text
db = SessionLocal()
try:
db.execute(
_text("UPDATE user_settings SET last_discovery_run = :now WHERE user_id = :uid"),
{"now": datetime.utcnow(), "uid": user_id},
)
db.commit()
except Exception:
db.rollback()
finally:
db.close()
def _do_task_search(user_id: int, query: str, source: str, score_multiplier: float):
from ..database import SessionLocal
db = SessionLocal()
try:
followed_yt_ids = _get_followed_yt_ids(db, user_id)
neg_tags = _get_neg_tags(db, user_id)
_search_and_store(db, user_id, [query], followed_yt_ids, score_multiplier, source, neg_tags)
finally:
db.close()
def _do_task_trending(user_id: int, region: str):
from ..database import SessionLocal
db = SessionLocal()
try:
update_trending_signal(db, user_id, [region])
finally:
db.close()
def _fetch_graph_for_channel(db: Session, user_id: int, source_yt_id: str):
"""Fetch featured channels for one followed channel and add to discovery queue."""
followed_yt_ids = _get_followed_yt_ids(db, user_id)
dismissed_ids = set(db.execute(
text("SELECT channel_id FROM user_channels WHERE user_id = :uid AND status = 'dismissed'"),
{"uid": user_id},
).scalars().all())
try:
featured = ytdlp.fetch_featured_channels(source_yt_id)
except Exception:
return
needs_indexing: list[int] = []
for yt_id in featured:
if yt_id in followed_yt_ids:
continue
channel = db.query(Channel).filter_by(youtube_channel_id=yt_id).first()
is_new = channel is None
if not channel:
channel = Channel(youtube_channel_id=yt_id, name="", description="", thumbnail_url=None)
db.add(channel)
db.flush()
if channel.id in dismissed_ids:
continue
uc = db.query(UserChannel).filter_by(user_id=user_id, channel_id=channel.id).first()
if uc and uc.status in ("followed", "dismissed"):
continue
_add_to_discovery(db, user_id, channel.id, score=8.0, source="graph")
if is_new or not channel.crawled_at:
needs_indexing.append(channel.id)
db.commit()
for channel_id in needs_indexing[:3]:
ch = db.query(Channel).filter_by(id=channel_id).first()
if ch:
_fetch_and_index_channel(db, ch)
def _do_task_graph(user_id: int, source_yt_id: str):
from ..database import SessionLocal
db = SessionLocal()
try:
_fetch_graph_for_channel(db, user_id, source_yt_id)
finally:
db.close()
def _worker_loop():
while True:
try:
user_id, task = _task_queue.get(timeout=10)
except _queue.Empty:
continue
try:
task()
except Exception:
pass
with _progress_lock:
p = _progress.get(user_id)
if p:
p["done"] = min(p["done"] + 1, p["total"])
if p["done"] >= p["total"] and p["running"]:
p["running"] = False
_threading.Thread(target=_stamp_last_run, args=(user_id,), daemon=True).start()
_task_queue.task_done()
# Polite gap — only sleep if more tasks are waiting
if not _task_queue.empty():
_time.sleep(random.uniform(30, 90))
def start_discovery_worker():
"""Start the singleton background worker thread (idempotent)."""
global _worker_started
with _worker_lock:
if not _worker_started:
_threading.Thread(target=_worker_loop, daemon=True, name="discovery-worker").start()
_worker_started = True
def get_discovery_progress(user_id: int) -> dict | None:
with _progress_lock:
p = _progress.get(user_id)
return dict(p) if p is not None else None
def _build_search_task_args(db: Session, user_id: int) -> list[tuple[str, str, float]]:
"""Compute all search/liked query strings without executing any yt-dlp calls."""
result: list[tuple[str, str, float]] = []
followed_rows = db.execute(
text("""
SELECT c.name, c.youtube_channel_id
FROM channels c
JOIN user_channels uc ON c.id = uc.channel_id
WHERE uc.user_id = :user_id AND uc.status = 'followed'
"""),
{"user_id": user_id},
).mappings().all()
followed_names = [row["name"] for row in followed_rows if row["name"]]
tag_rows = db.execute(
text("""
SELECT tags FROM (
SELECT v.tags FROM videos v
JOIN user_channels uc ON v.channel_id = uc.channel_id
WHERE uc.user_id = :user_id AND uc.status = 'followed'
AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
LIMIT 300
)
UNION ALL
SELECT tags FROM (
SELECT v.tags FROM user_videos uv
JOIN videos v ON uv.video_id = v.id
WHERE uv.user_id = :user_id AND uv.liked = 1
AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
LIMIT 100
)
"""),
{"user_id": user_id},
).mappings().all()
tag_counts: dict[str, int] = {}
liked_tag_counts: dict[str, int] = {}
for row in tag_rows:
try:
for tag in json.loads(row["tags"]):
if isinstance(tag, str):
t = tag.lower().strip()
if 3 <= len(t) <= 40:
tag_counts[t] = tag_counts.get(t, 0) + 1
except (json.JSONDecodeError, TypeError):
continue
cat_rows = db.execute(
text("""
SELECT v.category, COUNT(*) AS cnt
FROM videos v
JOIN user_channels uc ON v.channel_id = uc.channel_id
WHERE uc.user_id = :user_id AND uc.status = 'followed'
AND v.category IS NOT NULL
GROUP BY v.category
ORDER BY cnt DESC
LIMIT 5
"""),
{"user_id": user_id},
).mappings().all()
top_tags = [t for t, _ in sorted(tag_counts.items(), key=lambda x: -x[1])[:5]]
top_cats = [r["category"] for r in cat_rows]
sampled_names = random.sample(followed_names, min(4, len(followed_names))) if followed_names else []
serendipity = [f"best {top_cats[0]} channels"] if top_cats else []
search_queries = list(dict.fromkeys(top_tags + sampled_names + serendipity + top_cats[:2]))[:10]
for q in search_queries:
result.append((q, "search", 5.0))
# Liked signal queries
liked_rows = db.execute(
text("""
SELECT v.tags FROM user_videos uv
JOIN videos v ON uv.video_id = v.id
WHERE uv.user_id = :user_id AND uv.liked = 1
AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
"""),
{"user_id": user_id},
).mappings().all()
for row in liked_rows:
try:
for tag in json.loads(row["tags"]):
if isinstance(tag, str):
t = tag.lower().strip()
if 3 <= len(t) <= 40:
liked_tag_counts[t] = liked_tag_counts.get(t, 0) + 2
except (json.JSONDecodeError, TypeError):
pass
for q in [t for t, _ in sorted(liked_tag_counts.items(), key=lambda x: -x[1])[:4]]:
result.append((q, "liked", 10.0))
return result
def _sample_graph_yt_ids(db: Session, user_id: int) -> list[str]:
rows = db.execute(
text("""
SELECT c.youtube_channel_id
FROM channels c
JOIN user_channels uc ON c.id = uc.channel_id
WHERE uc.user_id = :user_id AND uc.status = 'followed'
AND c.youtube_channel_id IS NOT NULL
"""),
{"user_id": user_id},
).scalars().all()
if not rows:
return []
return random.sample(list(rows), min(6, len(rows)))
def schedule_discovery(user_id: int, regions: list[str] | None = None):
"""Schedule a full discovery sweep, spreading yt-dlp calls 30-90 s apart
with call types shuffled so searches, graph fetches, and trending are mixed."""
if regions is None:
regions = ["US", "SE"]
from ..database import SessionLocal
# Fast signals (pure SQL, no yt-dlp) run synchronously right now
db = SessionLocal()
try:
db.execute(
text("""
DELETE FROM discovery_queue
WHERE user_id = :uid AND seen = 0
AND created_at <= datetime('now', '-14 days')
"""),
{"uid": user_id},
)
db.commit()
update_community_signal(db, user_id)
update_category_clusters(db, user_id)
search_args = _build_search_task_args(db, user_id)
graph_yt_ids = _sample_graph_yt_ids(db, user_id)
finally:
db.close()
# Build one task per yt-dlp call, then shuffle to mix call types
tasks: list[tuple[int, object]] = []
for query, source, mult in search_args:
tasks.append((user_id, lambda q=query, s=source, m=mult: _do_task_search(user_id, q, s, m)))
for region in regions[:1]:
tasks.append((user_id, lambda r=region: _do_task_trending(user_id, r)))
for yt_id in graph_yt_ids:
tasks.append((user_id, lambda y=yt_id: _do_task_graph(user_id, y)))
random.shuffle(tasks)
with _progress_lock:
_progress[user_id] = {"total": len(tasks), "done": 0, "running": bool(tasks)}
for item in tasks:
_task_queue.put(item)
if not tasks:
_stamp_last_run(user_id)