diff --git a/backend/services/discovery.py b/backend/services/discovery.py index 02e20b5..36de4a2 100644 --- a/backend/services/discovery.py +++ b/backend/services/discovery.py @@ -1,7 +1,6 @@ """Discovery engine — search-based crawl, trending, community signal, category clustering.""" import json import random -from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from sqlalchemy.orm import Session from sqlalchemy import text @@ -13,7 +12,7 @@ from . import ytdlp def _fetch_and_index_channel(db: Session, channel: Channel): """Fetch full metadata + recent videos for a discovered channel.""" try: - result = ytdlp.fetch_channel_metadata(channel.youtube_channel_id, max_videos=10) + result = ytdlp.fetch_channel_metadata(channel.youtube_channel_id, max_videos=10, polite=True) if not result: return ch_data = result.get("channel", {}) @@ -33,7 +32,7 @@ def _fetch_and_index_channel(db: Session, channel: Channel): if not yt_id: continue try: - meta = ytdlp.fetch_video_metadata(yt_id) + meta = ytdlp.fetch_video_metadata(yt_id, polite=True) if meta and meta.get("published_at"): individual_fetched[yt_id] = meta except Exception: @@ -106,29 +105,25 @@ def _search_and_store( """Run YouTube searches for the given queries and add results to discovery.""" discovered: dict[str, dict] = {} - def _do_search(query: str) -> list[dict]: + for query in queries: try: - return ytdlp.search_youtube(query, max_results=40) + results = ytdlp.search_youtube(query, max_results=40) except Exception: - return [] - - with ThreadPoolExecutor(max_workers=4) as pool: - futures = {pool.submit(_do_search, q): q for q in queries} - for fut in as_completed(futures): - for video in fut.result(): - ch = video.get("channel", {}) - yt_id = ch.get("youtube_channel_id") - name = (ch.get("name") or "").strip() - if yt_id and name and yt_id not in followed_yt_ids: - if yt_id not in discovered: - discovered[yt_id] = {"name": name, "count": 0, "previews": []} - discovered[yt_id]["count"] += 1 - previews = discovered[yt_id]["previews"] - if len(previews) < 3 and video.get("thumbnail_url") and video.get("title"): - previews.append({ - "thumbnail_url": video["thumbnail_url"], - "title": video["title"], - }) + results = [] + for video in results: + ch = video.get("channel", {}) + yt_id = ch.get("youtube_channel_id") + name = (ch.get("name") or "").strip() + if yt_id and name and yt_id not in followed_yt_ids: + if yt_id not in discovered: + discovered[yt_id] = {"name": name, "count": 0, "previews": []} + discovered[yt_id]["count"] += 1 + previews = discovered[yt_id]["previews"] + if len(previews) < 3 and video.get("thumbnail_url") and video.get("title"): + previews.append({ + "thumbnail_url": video["thumbnail_url"], + "title": video["title"], + }) if not discovered: return @@ -613,18 +608,12 @@ def update_graph_signal(db: Session, user_id: int): sample = random.sample(list(followed_rows), min(12, len(followed_rows))) - def _fetch(yt_id: str) -> list[str]: - try: - return ytdlp.fetch_featured_channels(yt_id) - except Exception: - return [] - featured_map: dict[str, list[str]] = {} - with ThreadPoolExecutor(max_workers=3) as pool: - futures = {pool.submit(_fetch, row["youtube_channel_id"]): row for row in sample} - for fut in as_completed(futures): - row = futures[fut] - featured_map[row["youtube_channel_id"]] = fut.result() + for row in sample: + try: + featured_map[row["youtube_channel_id"]] = ytdlp.fetch_featured_channels(row["youtube_channel_id"]) + except Exception: + featured_map[row["youtube_channel_id"]] = [] needs_indexing: list[int] = [] for source_yt_id, channel_ids in featured_map.items(): diff --git a/backend/services/ytdlp.py b/backend/services/ytdlp.py index b131959..67cf041 100644 --- a/backend/services/ytdlp.py +++ b/backend/services/ytdlp.py @@ -152,7 +152,7 @@ def _normalize_channel(info: dict) -> dict: def search_youtube(query: str, max_results: int = 40) -> list[dict]: """Search YouTube via yt-dlp. Uses --flat-playlist for fast results.""" - stdout, _, code = _run([ + stdout, _, code = _meta_run([ "yt-dlp", f"ytsearch{max_results}:{query}", "--dump-json", @@ -199,7 +199,7 @@ def fetch_trending(region: str = "US", max_results: int = 50) -> list[dict]: region = region.upper() # CAI%3D = sort by upload date; gl= sets the region url = f"https://www.youtube.com/results?search_query=trending&sp=CAI%253D&gl={region}" - stdout, _, code = _run([ + stdout, _, code = _meta_run([ "yt-dlp", url, "--dump-json", @@ -474,7 +474,7 @@ def fetch_featured_channels(channel_id: str) -> list[str]: url = f"https://www.youtube.com/{channel_id}/channels" else: url = f"https://www.youtube.com/channel/{channel_id}/channels" - stdout, _, code = _run([ + stdout, _, code = _meta_run([ "yt-dlp", url, "--dump-json", "--flat-playlist",