Route all discovery fetches through global rate limiter
- search_youtube, fetch_trending, and fetch_featured_channels now use _meta_run
- Replaced ThreadPoolExecutor(4) parallel searches with a sequential loop
- Replaced ThreadPoolExecutor(3) parallel featured-channel fetches with a sequential loop
- _fetch_and_index_channel passes polite=True to fetch_channel_metadata and fetch_video_metadata

Discovery was firing 4+ simultaneous yt-dlp processes, each with cookies, which is what invalidated the session.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,6 @@
|
|||||||
"""Discovery engine — search-based crawl, trending, community signal, category clustering."""
|
"""Discovery engine — search-based crawl, trending, community signal, category clustering."""
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
@@ -13,7 +12,7 @@ from . import ytdlp
|
|||||||
def _fetch_and_index_channel(db: Session, channel: Channel):
|
def _fetch_and_index_channel(db: Session, channel: Channel):
|
||||||
"""Fetch full metadata + recent videos for a discovered channel."""
|
"""Fetch full metadata + recent videos for a discovered channel."""
|
||||||
try:
|
try:
|
||||||
result = ytdlp.fetch_channel_metadata(channel.youtube_channel_id, max_videos=10)
|
result = ytdlp.fetch_channel_metadata(channel.youtube_channel_id, max_videos=10, polite=True)
|
||||||
if not result:
|
if not result:
|
||||||
return
|
return
|
||||||
ch_data = result.get("channel", {})
|
ch_data = result.get("channel", {})
|
||||||
@@ -33,7 +32,7 @@ def _fetch_and_index_channel(db: Session, channel: Channel):
|
|||||||
if not yt_id:
|
if not yt_id:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
meta = ytdlp.fetch_video_metadata(yt_id)
|
meta = ytdlp.fetch_video_metadata(yt_id, polite=True)
|
||||||
if meta and meta.get("published_at"):
|
if meta and meta.get("published_at"):
|
||||||
individual_fetched[yt_id] = meta
|
individual_fetched[yt_id] = meta
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -106,29 +105,25 @@ def _search_and_store(
|
|||||||
"""Run YouTube searches for the given queries and add results to discovery."""
|
"""Run YouTube searches for the given queries and add results to discovery."""
|
||||||
discovered: dict[str, dict] = {}
|
discovered: dict[str, dict] = {}
|
||||||
|
|
||||||
def _do_search(query: str) -> list[dict]:
|
for query in queries:
|
||||||
try:
|
try:
|
||||||
return ytdlp.search_youtube(query, max_results=40)
|
results = ytdlp.search_youtube(query, max_results=40)
|
||||||
except Exception:
|
except Exception:
|
||||||
return []
|
results = []
|
||||||
|
for video in results:
|
||||||
with ThreadPoolExecutor(max_workers=4) as pool:
|
ch = video.get("channel", {})
|
||||||
futures = {pool.submit(_do_search, q): q for q in queries}
|
yt_id = ch.get("youtube_channel_id")
|
||||||
for fut in as_completed(futures):
|
name = (ch.get("name") or "").strip()
|
||||||
for video in fut.result():
|
if yt_id and name and yt_id not in followed_yt_ids:
|
||||||
ch = video.get("channel", {})
|
if yt_id not in discovered:
|
||||||
yt_id = ch.get("youtube_channel_id")
|
discovered[yt_id] = {"name": name, "count": 0, "previews": []}
|
||||||
name = (ch.get("name") or "").strip()
|
discovered[yt_id]["count"] += 1
|
||||||
if yt_id and name and yt_id not in followed_yt_ids:
|
previews = discovered[yt_id]["previews"]
|
||||||
if yt_id not in discovered:
|
if len(previews) < 3 and video.get("thumbnail_url") and video.get("title"):
|
||||||
discovered[yt_id] = {"name": name, "count": 0, "previews": []}
|
previews.append({
|
||||||
discovered[yt_id]["count"] += 1
|
"thumbnail_url": video["thumbnail_url"],
|
||||||
previews = discovered[yt_id]["previews"]
|
"title": video["title"],
|
||||||
if len(previews) < 3 and video.get("thumbnail_url") and video.get("title"):
|
})
|
||||||
previews.append({
|
|
||||||
"thumbnail_url": video["thumbnail_url"],
|
|
||||||
"title": video["title"],
|
|
||||||
})
|
|
||||||
|
|
||||||
if not discovered:
|
if not discovered:
|
||||||
return
|
return
|
||||||
@@ -613,18 +608,12 @@ def update_graph_signal(db: Session, user_id: int):
|
|||||||
|
|
||||||
sample = random.sample(list(followed_rows), min(12, len(followed_rows)))
|
sample = random.sample(list(followed_rows), min(12, len(followed_rows)))
|
||||||
|
|
||||||
def _fetch(yt_id: str) -> list[str]:
|
|
||||||
try:
|
|
||||||
return ytdlp.fetch_featured_channels(yt_id)
|
|
||||||
except Exception:
|
|
||||||
return []
|
|
||||||
|
|
||||||
featured_map: dict[str, list[str]] = {}
|
featured_map: dict[str, list[str]] = {}
|
||||||
with ThreadPoolExecutor(max_workers=3) as pool:
|
for row in sample:
|
||||||
futures = {pool.submit(_fetch, row["youtube_channel_id"]): row for row in sample}
|
try:
|
||||||
for fut in as_completed(futures):
|
featured_map[row["youtube_channel_id"]] = ytdlp.fetch_featured_channels(row["youtube_channel_id"])
|
||||||
row = futures[fut]
|
except Exception:
|
||||||
featured_map[row["youtube_channel_id"]] = fut.result()
|
featured_map[row["youtube_channel_id"]] = []
|
||||||
|
|
||||||
needs_indexing: list[int] = []
|
needs_indexing: list[int] = []
|
||||||
for source_yt_id, channel_ids in featured_map.items():
|
for source_yt_id, channel_ids in featured_map.items():
|
||||||
|
|||||||
@@ -152,7 +152,7 @@ def _normalize_channel(info: dict) -> dict:
|
|||||||
|
|
||||||
def search_youtube(query: str, max_results: int = 40) -> list[dict]:
|
def search_youtube(query: str, max_results: int = 40) -> list[dict]:
|
||||||
"""Search YouTube via yt-dlp. Uses --flat-playlist for fast results."""
|
"""Search YouTube via yt-dlp. Uses --flat-playlist for fast results."""
|
||||||
stdout, _, code = _run([
|
stdout, _, code = _meta_run([
|
||||||
"yt-dlp",
|
"yt-dlp",
|
||||||
f"ytsearch{max_results}:{query}",
|
f"ytsearch{max_results}:{query}",
|
||||||
"--dump-json",
|
"--dump-json",
|
||||||
@@ -199,7 +199,7 @@ def fetch_trending(region: str = "US", max_results: int = 50) -> list[dict]:
|
|||||||
region = region.upper()
|
region = region.upper()
|
||||||
# CAI%3D = sort by upload date; gl= sets the region
|
# CAI%3D = sort by upload date; gl= sets the region
|
||||||
url = f"https://www.youtube.com/results?search_query=trending&sp=CAI%253D&gl={region}"
|
url = f"https://www.youtube.com/results?search_query=trending&sp=CAI%253D&gl={region}"
|
||||||
stdout, _, code = _run([
|
stdout, _, code = _meta_run([
|
||||||
"yt-dlp",
|
"yt-dlp",
|
||||||
url,
|
url,
|
||||||
"--dump-json",
|
"--dump-json",
|
||||||
@@ -474,7 +474,7 @@ def fetch_featured_channels(channel_id: str) -> list[str]:
|
|||||||
url = f"https://www.youtube.com/{channel_id}/channels"
|
url = f"https://www.youtube.com/{channel_id}/channels"
|
||||||
else:
|
else:
|
||||||
url = f"https://www.youtube.com/channel/{channel_id}/channels"
|
url = f"https://www.youtube.com/channel/{channel_id}/channels"
|
||||||
stdout, _, code = _run([
|
stdout, _, code = _meta_run([
|
||||||
"yt-dlp", url,
|
"yt-dlp", url,
|
||||||
"--dump-json",
|
"--dump-json",
|
||||||
"--flat-playlist",
|
"--flat-playlist",
|
||||||
|
|||||||
Reference in New Issue
Block a user