Add queue-based gradual discovery with shuffled call ordering and progress UI

Each yt-dlp call is now an independent task (one search query, one trending fetch, one graph channel fetch). Tasks are shuffled together so we don't fire 10 searches in a row, then enqueued with 30-90s random gaps between them — a full sweep of ~17 tasks completes in roughly 10-25 minutes instead of hammering YouTube with 21 calls back-to-back. Fast signals (community, category clusters) still run synchronously at schedule time since they're pure SQL. Progress is tracked per-user (total/done/running) and exposed on GET /api/discovery/status. The Discovery page polls every 10s while running and shows a progress bar + "Finding channels… X / Y" in the header. The auto-discovery daemon skips scheduling if a manual sweep is already running.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-27 02:28:35 +02:00
parent e6faf8e08e
commit a535e9f22a
4 changed files with 367 additions and 44 deletions
--- a/backend/main.py
+++ b/backend/main.py
@@ -175,9 +175,11 @@ def on_startup():
    finally:
        db.close()

-    # Backfill descriptions for videos that don't have them yet (runs in background)
+    # Start discovery worker and backfill enrichment
    import threading
    from .routers.channels import _enrich_missing_task, _index_channels_batch
+    from .services.discovery import start_discovery_worker
+    start_discovery_worker()
    threading.Thread(target=_enrich_missing_task, args=(50,), daemon=True).start()

    def _auto_sync_daemon():
@@ -222,7 +224,7 @@ def on_startup():
        import time as _time
        from datetime import datetime as _dt, timedelta as _td
        from sqlalchemy import text as _text
-        from .services.discovery import run_full_discovery
+        from .services.discovery import schedule_discovery, get_discovery_progress

        # Wait 5 minutes after startup before the first check so the app can
        # finish initialising and existing enrichment tasks can settle.
@@ -244,13 +246,12 @@ def on_startup():
                        last = row["last_discovery_run"]
                        if last is None or (_dt.utcnow() - _dt.fromisoformat(str(last))) > _td(hours=23):
                            uid = row["user_id"]
+                            # Skip if a manual sweep is already running
+                            prog = get_discovery_progress(uid)
+                            if prog and prog.get("running"):
+                                continue
                            regions = [r.strip().upper() for r in (row["discovery_regions"] or "US,SE").split(",") if r.strip()]
-                            run_full_discovery(db, uid, regions)
-                            db.execute(
-                                _text("UPDATE user_settings SET last_discovery_run = :now WHERE user_id = :uid"),
-                                {"now": _dt.utcnow(), "uid": uid},
-                            )
-                            db.commit()
+                            schedule_discovery(uid, regions)
                finally:
                    db.close()
            except Exception: