Add queue-based gradual discovery with shuffled call ordering and progress UI
Each yt-dlp call is now an independent task (one search query, one trending fetch, one graph channel fetch). Tasks are shuffled together so we don't fire 10 searches in a row, then enqueued with 30-90s random gaps between them — a full sweep of ~17 tasks completes in roughly 10-25 minutes instead of hammering YouTube with 21 calls back-to-back. Fast signals (community, category clusters) still run synchronously at schedule time since they're pure SQL. Progress is tracked per-user (total/done/running) and exposed on GET /api/discovery/status. The Discovery page polls every 10s while running and shows a progress bar + "Finding channels… X / Y" in the header. The auto-discovery daemon skips scheduling if a manual sweep is already running. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
"""Discovery engine — search-based crawl, trending, community signal, category clustering."""
|
||||
import json
|
||||
import queue as _queue
|
||||
import random
|
||||
import threading as _threading
|
||||
import time as _time
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import text
|
||||
@@ -8,6 +11,16 @@ from sqlalchemy import text
|
||||
from ..models import Channel, UserChannel, DiscoveryQueue, Video
|
||||
from . import ytdlp
|
||||
|
||||
# ---------------------------------------------------------------------------
# Background task queue — spaces yt-dlp calls 30-90 s apart and shuffles
# call types so we don't fire 10 searches in a row.
# ---------------------------------------------------------------------------
# Queue of (user_id, zero-argument callable) pairs: schedule_discovery() puts
# them in, _worker_loop() takes them out and calls them one at a time.
_task_queue: _queue.Queue = _queue.Queue()
# Per-user sweep progress; every access in this module holds _progress_lock.
_progress: dict[int, dict] = {}  # user_id -> {total, done, running}
_progress_lock = _threading.Lock()
# Flipped to True by start_discovery_worker() once the worker thread has been
# started; _worker_lock guards that check-then-start sequence.
_worker_started = False
_worker_lock = _threading.Lock()
|
||||
|
||||
|
||||
def _fetch_and_index_channel(db: Session, channel: Channel):
|
||||
"""Fetch full metadata + recent videos for a discovered channel."""
|
||||
@@ -680,3 +693,316 @@ def run_full_discovery(db: Session, user_id: int, regions: list[str] | None = No
|
||||
# update_watch_signal skipped — tags already included in crawl_by_search
|
||||
update_trending_signal(db, user_id, regions[:1]) # 1 yt-dlp call (first region only)
|
||||
update_graph_signal(db, user_id) # ~6 yt-dlp calls
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Queue-based gradual discovery — each yt-dlp call is its own task, shuffled
|
||||
# so call types are mixed, with 30-90 s gaps between them.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_followed_yt_ids(db: Session, user_id: int) -> set[str]:
    """Return the YouTube channel IDs of all channels *user_id* follows.

    Reads ``channels`` joined to ``user_channels`` and keeps only rows whose
    status is ``'followed'``.
    """
    stmt = text("""
        SELECT c.youtube_channel_id FROM channels c
        JOIN user_channels uc ON c.id = uc.channel_id
        WHERE uc.user_id = :uid AND uc.status = 'followed'
    """)
    followed = db.execute(stmt, {"uid": user_id}).scalars().all()
    return set(followed)
|
||||
|
||||
|
||||
def _get_neg_tags(db: Session, user_id: int) -> frozenset[str]:
    """Return the tags whose affinity score for *user_id* is below -2."""
    stmt = text("SELECT tag FROM user_tag_affinity WHERE user_id = :uid AND score < -2")
    disliked = db.execute(stmt, {"uid": user_id}).scalars().all()
    return frozenset(disliked)
|
||||
|
||||
|
||||
def _stamp_last_run(user_id: int):
    """Set ``user_settings.last_discovery_run`` for *user_id* to the current UTC time.

    Opens and closes its own database session. The update is best-effort:
    on any failure the transaction is rolled back and the error is logged,
    but nothing is raised to the caller.
    """
    import logging
    from datetime import timezone

    from ..database import SessionLocal

    db = SessionLocal()
    try:
        db.execute(
            text("UPDATE user_settings SET last_discovery_run = :now WHERE user_id = :uid"),
            {
                # Naive UTC timestamp — the same value datetime.utcnow() yields,
                # without relying on the call deprecated since Python 3.12.
                "now": datetime.now(timezone.utc).replace(tzinfo=None),
                "uid": user_id,
            },
        )
        db.commit()
    except Exception:
        # Keep the best-effort contract, but leave a trace instead of
        # silently discarding the failure.
        logging.getLogger(__name__).exception(
            "Could not record last_discovery_run for user %s", user_id
        )
        db.rollback()
    finally:
        db.close()
|
||||
|
||||
|
||||
def _do_task_search(user_id: int, query: str, source: str, score_multiplier: float):
    """Queue task: run a single search *query* for *user_id* and store the hits.

    Opens its own session, loads the user's followed channel IDs and
    negatively scored tags, hands everything to ``_search_and_store`` and
    always closes the session afterwards.
    """
    from ..database import SessionLocal

    session = SessionLocal()
    try:
        already_followed = _get_followed_yt_ids(session, user_id)
        excluded_tags = _get_neg_tags(session, user_id)
        _search_and_store(
            session,
            user_id,
            [query],
            already_followed,
            score_multiplier,
            source,
            excluded_tags,
        )
    finally:
        session.close()
|
||||
|
||||
|
||||
def _do_task_trending(user_id: int, region: str):
    """Queue task: refresh the trending signal for *user_id* in one *region*.

    Runs ``update_trending_signal`` on a session that is opened here and
    closed again whether or not the update succeeds.
    """
    from ..database import SessionLocal

    session = SessionLocal()
    try:
        update_trending_signal(session, user_id, [region])
    finally:
        session.close()
|
||||
|
||||
|
||||
def _fetch_graph_for_channel(db: Session, user_id: int, source_yt_id: str):
    """Fetch featured channels for one followed channel and add to discovery queue.

    For every channel featured by *source_yt_id* that the user neither follows
    nor has dismissed, a ``Channel`` row is created if missing and the channel
    is added to the user's discovery queue with score 8.0 and source
    ``"graph"``. After committing, at most three of the channels that are new
    or have no ``crawled_at`` value are passed to ``_fetch_and_index_channel``.

    A failure while fetching the featured list is logged and ends the task
    without raising; the function is best-effort.
    """
    import logging

    followed_yt_ids = _get_followed_yt_ids(db, user_id)
    dismissed_ids = set(db.execute(
        text("SELECT channel_id FROM user_channels WHERE user_id = :uid AND status = 'dismissed'"),
        {"uid": user_id},
    ).scalars().all())

    try:
        featured = ytdlp.fetch_featured_channels(source_yt_id)
    except Exception:
        # Still best-effort, but record why this task produced nothing.
        logging.getLogger(__name__).exception(
            "Fetching featured channels of %s failed", source_yt_id
        )
        return

    needs_indexing: list[int] = []
    for yt_id in featured:
        if yt_id in followed_yt_ids:
            continue

        channel = db.query(Channel).filter_by(youtube_channel_id=yt_id).first()
        is_new = channel is None
        if not channel:
            # Placeholder row; flush so channel.id is populated for the checks below.
            channel = Channel(youtube_channel_id=yt_id, name="", description="", thumbnail_url=None)
            db.add(channel)
            db.flush()

        if channel.id in dismissed_ids:
            continue
        uc = db.query(UserChannel).filter_by(user_id=user_id, channel_id=channel.id).first()
        if uc and uc.status in ("followed", "dismissed"):
            continue

        _add_to_discovery(db, user_id, channel.id, score=8.0, source="graph")
        if is_new or not channel.crawled_at:
            needs_indexing.append(channel.id)

    db.commit()

    # Cap the follow-up indexing at three channels per task.
    for channel_id in needs_indexing[:3]:
        ch = db.query(Channel).filter_by(id=channel_id).first()
        if ch:
            _fetch_and_index_channel(db, ch)
|
||||
|
||||
|
||||
def _do_task_graph(user_id: int, source_yt_id: str):
    """Queue task: expand the channel graph from one followed channel.

    Calls ``_fetch_graph_for_channel`` with a session opened here; the
    session is closed even if that call raises.
    """
    from ..database import SessionLocal

    session = SessionLocal()
    try:
        _fetch_graph_for_channel(session, user_id, source_yt_id)
    finally:
        session.close()
|
||||
|
||||
|
||||
def _worker_loop():
    """Run queued discovery tasks forever, one at a time.

    Each queue item is a ``(user_id, task)`` pair where ``task`` is a
    zero-argument callable. After a task finishes (successfully or not) the
    user's progress entry is advanced; when the last task of a sweep
    completes, ``running`` is cleared and ``_stamp_last_run`` is started on a
    separate daemon thread. If more tasks are waiting, the loop sleeps a
    random 30-90 s before taking the next one.
    """
    import logging

    log = logging.getLogger(__name__)

    while True:
        try:
            user_id, task = _task_queue.get(timeout=10)
        except _queue.Empty:
            continue

        try:
            task()
        except Exception:
            # One failing task must not stop the worker, but the failure
            # should be visible rather than silently discarded.
            log.exception("Discovery task for user %s failed", user_id)

        sweep_finished = False
        with _progress_lock:
            p = _progress.get(user_id)
            if p:
                p["done"] = min(p["done"] + 1, p["total"])
                if p["done"] >= p["total"] and p["running"]:
                    p["running"] = False
                    sweep_finished = True

        if sweep_finished:
            # Started after the lock is released so thread creation never
            # happens while other threads wait on _progress_lock.
            _threading.Thread(target=_stamp_last_run, args=(user_id,), daemon=True).start()

        _task_queue.task_done()

        # Polite gap — only sleep if more tasks are waiting
        if not _task_queue.empty():
            _time.sleep(random.uniform(30, 90))
|
||||
|
||||
|
||||
def start_discovery_worker():
    """Launch the background worker thread unless one was already started.

    Safe to call repeatedly: the started flag is checked and set while
    holding ``_worker_lock``.
    """
    global _worker_started
    with _worker_lock:
        if _worker_started:
            return
        worker = _threading.Thread(
            target=_worker_loop,
            daemon=True,
            name="discovery-worker",
        )
        worker.start()
        _worker_started = True
|
||||
|
||||
|
||||
def get_discovery_progress(user_id: int) -> dict | None:
    """Return a copy of the progress entry for *user_id*, or ``None`` if none exists.

    The copy is taken while holding ``_progress_lock``; later changes made by
    the worker are not reflected in the returned dict.
    """
    with _progress_lock:
        entry = _progress.get(user_id)
        if entry is None:
            return None
        return dict(entry)
|
||||
|
||||
|
||||
def _build_search_task_args(db: Session, user_id: int) -> list[tuple[str, str, float]]:
    """Plan the search queries for one sweep using SQL only (no yt-dlp calls).

    Returns ``(query, source, score_multiplier)`` tuples: first up to ten
    ``"search"`` entries (multiplier 5.0) built from the most frequent tags,
    a random sample of followed channel names, a "best <category> channels"
    query and the top two categories; then up to four ``"liked"`` entries
    (multiplier 10.0) built from the most frequent tags of liked videos.
    """
    params = {"user_id": user_id}

    def tally(rows, counts: dict[str, int], weight: int) -> None:
        """Add *weight* to *counts* for every 3-40 character tag found in *rows*."""
        for row in rows:
            try:
                for raw in json.loads(row["tags"]):
                    if not isinstance(raw, str):
                        continue
                    tag = raw.lower().strip()
                    if 3 <= len(tag) <= 40:
                        counts[tag] = counts.get(tag, 0) + weight
            except (json.JSONDecodeError, TypeError):
                # Unparseable tag payload: give up on this row, keep the rest.
                continue

    def most_frequent(counts: dict[str, int], limit: int) -> list[str]:
        """Return the *limit* highest-count keys; ties keep insertion order."""
        return sorted(counts, key=counts.get, reverse=True)[:limit]

    # --- names of followed channels -------------------------------------
    followed_rows = db.execute(
        text("""
            SELECT c.name, c.youtube_channel_id
            FROM channels c
            JOIN user_channels uc ON c.id = uc.channel_id
            WHERE uc.user_id = :user_id AND uc.status = 'followed'
        """),
        params,
    ).mappings().all()
    followed_names = [row["name"] for row in followed_rows if row["name"]]

    # --- tags from followed channels' videos plus liked videos -----------
    tag_rows = db.execute(
        text("""
            SELECT tags FROM (
                SELECT v.tags FROM videos v
                JOIN user_channels uc ON v.channel_id = uc.channel_id
                WHERE uc.user_id = :user_id AND uc.status = 'followed'
                AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
                LIMIT 300
            )
            UNION ALL
            SELECT tags FROM (
                SELECT v.tags FROM user_videos uv
                JOIN videos v ON uv.video_id = v.id
                WHERE uv.user_id = :user_id AND uv.liked = 1
                AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
                LIMIT 100
            )
        """),
        params,
    ).mappings().all()
    general_counts: dict[str, int] = {}
    tally(tag_rows, general_counts, 1)

    # --- most common categories among followed channels ------------------
    cat_rows = db.execute(
        text("""
            SELECT v.category, COUNT(*) AS cnt
            FROM videos v
            JOIN user_channels uc ON v.channel_id = uc.channel_id
            WHERE uc.user_id = :user_id AND uc.status = 'followed'
            AND v.category IS NOT NULL
            GROUP BY v.category
            ORDER BY cnt DESC
            LIMIT 5
        """),
        params,
    ).mappings().all()

    top_tags = most_frequent(general_counts, 5)
    top_cats = [row["category"] for row in cat_rows]
    if followed_names:
        sampled_names = random.sample(followed_names, min(4, len(followed_names)))
    else:
        sampled_names = []
    serendipity = [f"best {top_cats[0]} channels"] if top_cats else []

    # dict.fromkeys drops duplicate queries while keeping first-seen order.
    candidates = [*top_tags, *sampled_names, *serendipity, *top_cats[:2]]
    search_queries = list(dict.fromkeys(candidates))[:10]
    plan: list[tuple[str, str, float]] = [(q, "search", 5.0) for q in search_queries]

    # Liked signal queries
    liked_rows = db.execute(
        text("""
            SELECT v.tags FROM user_videos uv
            JOIN videos v ON uv.video_id = v.id
            WHERE uv.user_id = :user_id AND uv.liked = 1
            AND v.tags IS NOT NULL AND v.tags != '' AND v.tags != '[]'
        """),
        params,
    ).mappings().all()
    liked_counts: dict[str, int] = {}
    tally(liked_rows, liked_counts, 2)

    plan.extend((tag, "liked", 10.0) for tag in most_frequent(liked_counts, 4))
    return plan
|
||||
|
||||
|
||||
def _sample_graph_yt_ids(db: Session, user_id: int) -> list[str]:
    """Pick up to six random YouTube channel IDs from the channels *user_id* follows.

    Returns an empty list when the user follows no channel with a non-NULL
    ``youtube_channel_id``.
    """
    stmt = text("""
        SELECT c.youtube_channel_id
        FROM channels c
        JOIN user_channels uc ON c.id = uc.channel_id
        WHERE uc.user_id = :user_id AND uc.status = 'followed'
        AND c.youtube_channel_id IS NOT NULL
    """)
    candidates = db.execute(stmt, {"user_id": user_id}).scalars().all()
    if candidates:
        sample_size = min(6, len(candidates))
        return random.sample(list(candidates), sample_size)
    return []
|
||||
|
||||
|
||||
def schedule_discovery(user_id: int, regions: list[str] | None = None):
    """Schedule a full discovery sweep, spreading yt-dlp calls 30-90 s apart
    with call types shuffled so searches, graph fetches, and trending are mixed.

    Runs the SQL-only work immediately (pruning stale queue entries, community
    signal, category clusters), then enqueues one task per yt-dlp call for the
    background worker. Only the first entry of *regions* (default
    ``["US", "SE"]``) is used for the trending task.

    If a sweep for the same user is still running, the new tasks are added to
    that sweep's ``total`` instead of resetting its counters, so progress does
    not report completion while earlier tasks are still queued.
    """
    if regions is None:
        regions = ["US", "SE"]

    from ..database import SessionLocal

    # Fast signals (pure SQL, no yt-dlp) run synchronously right now
    db = SessionLocal()
    try:
        # Drop unseen suggestions that are at least 14 days old.
        db.execute(
            text("""
                DELETE FROM discovery_queue
                WHERE user_id = :uid AND seen = 0
                AND created_at <= datetime('now', '-14 days')
            """),
            {"uid": user_id},
        )
        db.commit()
        update_community_signal(db, user_id)
        update_category_clusters(db, user_id)

        search_args = _build_search_task_args(db, user_id)
        graph_yt_ids = _sample_graph_yt_ids(db, user_id)
    finally:
        db.close()

    # Build one task per yt-dlp call, then shuffle to mix call types.
    # Loop variables are bound as lambda defaults to avoid late binding.
    tasks: list[tuple[int, object]] = []
    for query, source, mult in search_args:
        tasks.append((user_id, lambda q=query, s=source, m=mult: _do_task_search(user_id, q, s, m)))
    for region in regions[:1]:
        tasks.append((user_id, lambda r=region: _do_task_trending(user_id, r)))
    for yt_id in graph_yt_ids:
        tasks.append((user_id, lambda y=yt_id: _do_task_graph(user_id, y)))

    random.shuffle(tasks)

    with _progress_lock:
        current = _progress.get(user_id)
        if current and current["running"]:
            # Tasks from the earlier sweep are still queued and will also
            # advance "done"; extend the total rather than resetting it.
            current["total"] += len(tasks)
        else:
            _progress[user_id] = {"total": len(tasks), "done": 0, "running": bool(tasks)}

    for item in tasks:
        _task_queue.put(item)

    if not tasks:
        # Nothing for the worker to do, so record the run here.
        _stamp_last_run(user_id)
|
||||
|
||||
Reference in New Issue
Block a user