Add global yt-dlp metadata rate limiter (5s + jitter between calls)

All fetch_video_metadata / fetch_channel_metadata / fetch_channel_playlists
/ fetch_available_subs calls now go through _meta_run, which enforces a
minimum 5s gap between calls across all concurrent tasks (plus 0.5-2.5s of
random jitter whenever a call has to wait). The per-task sleep loops are
removed because the global lock already serializes these calls.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-27 00:31:14 +02:00
parent 15e6b94cbf
commit c180c293b0
2 changed files with 29 additions and 17 deletions

View File

@@ -80,11 +80,7 @@ def _get_channel_or_404(db: Session, channel_id: int) -> Channel:
def _index_channels_batch(channel_ids: list[int], user_id: int, delay: float = 1.5) -> None:
    """Run channel syncs sequentially, one channel at a time.

    Args:
        channel_ids: IDs of the channels to sync, processed in the given order.
        user_id: Passed through to ``_index_channel_task`` for every channel.
        delay: Unused. Kept only so existing callers that pass it keep
            working; this function no longer sleeps between channels.
    """
    for cid in channel_ids:
        _index_channel_task(cid, user_id)
@@ -208,9 +204,7 @@ def _enrich_missing_task(limit: int = 20):
"""), """),
{"limit": limit}, {"limit": limit},
).mappings().all() ).mappings().all()
for i, row in enumerate(rows): for row in rows:
if i > 0:
time.sleep(2.0)
try: try:
meta = ytdlp.fetch_video_metadata(row["youtube_video_id"]) meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
if meta: if meta:
@@ -824,9 +818,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
_tasks[task_id]["done"] = 0 _tasks[task_id]["done"] = 0
try: try:
for i, yt_id in enumerate(video_ids): for yt_id in video_ids:
if i > 0:
time.sleep(2.0)
try: try:
meta = ytdlp.fetch_video_metadata(yt_id) meta = ytdlp.fetch_video_metadata(yt_id)
if meta: if meta:

View File

@@ -1,8 +1,10 @@
"""Subprocess wrapper for yt-dlp.""" """Subprocess wrapper for yt-dlp."""
import json import json
import random
import re import re
import subprocess import subprocess
import threading import threading
import time
import urllib.request import urllib.request
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from datetime import datetime, timezone from datetime import datetime, timezone
@@ -17,6 +19,24 @@ def _run(args: list[str], timeout: int = 60) -> tuple[str, str, int]:
return result.stdout, result.stderr, result.returncode return result.stdout, result.stderr, result.returncode
# Global rate limiter for all metadata fetches — prevents concurrent tasks from
# hammering YouTube and invalidating cookies.
_meta_lock = threading.Lock()
# time.monotonic() reading taken just before the most recent metadata call.
# Starts at -inf so the very first call never waits: the reference point of
# time.monotonic() is undefined, so comparing it against a literal 0.0 could
# impose a spurious delay when the clock value happens to be small.
_meta_last_call: float = float("-inf")
_META_MIN_GAP = 5.0  # seconds between any two metadata requests


def _meta_run(args: list[str], timeout: int = 60) -> tuple[str, str, int]:
    """Run a command via ``_run`` after passing the global metadata rate limiter.

    If fewer than ``_META_MIN_GAP`` seconds have passed since the previous
    call started, sleep for the remainder plus 0.5-2.5s of random jitter.
    No jitter is added when the gap has already elapsed.

    Args:
        args: Full command line, forwarded unchanged to ``_run``.
        timeout: Forwarded to ``_run`` as its ``timeout`` keyword.

    Returns:
        Whatever ``_run`` returns for ``args``.
    """
    global _meta_last_call
    # The lock is held across both the sleep and the command itself, so
    # callers on other threads queue up here and run one at a time.
    with _meta_lock:
        elapsed = time.monotonic() - _meta_last_call
        wait = _META_MIN_GAP - elapsed
        if wait > 0:
            time.sleep(wait + random.uniform(0.5, 2.5))
        # Stamp the start of this call (after any sleep); the next caller's
        # gap is measured from here, not from when the command finishes.
        _meta_last_call = time.monotonic()
        return _run(args, timeout=timeout)
def _parse_date(date_str: str | None) -> datetime | None: def _parse_date(date_str: str | None) -> datetime | None:
if not date_str: if not date_str:
return None return None
@@ -233,13 +253,13 @@ def fetch_video_metadata(video_id: str) -> dict | None:
"yt-dlp", url, "yt-dlp", url,
"--dump-json", "--no-download", "--no-playlist", "--dump-json", "--no-download", "--no-playlist",
] ]
stdout, stderr, code = _run([*base_cmd, *cookie_args], timeout=30) stdout, stderr, code = _meta_run([*base_cmd, *cookie_args], timeout=30)
if code != 0: if code != 0:
print(f"[fetch_meta] FAILED code={code} stderr={stderr[:500]!r}", flush=True) print(f"[fetch_meta] FAILED code={code} stderr={stderr[:500]!r}", flush=True)
# Retry without auth args — broken cookie config shouldn't block public videos # Retry without auth args — broken cookie config shouldn't block public videos
if cookie_args: if cookie_args:
print(f"[fetch_meta] retrying without cookie args", flush=True) print(f"[fetch_meta] retrying without cookie args", flush=True)
stdout, stderr, code = _run(base_cmd, timeout=30) stdout, stderr, code = _meta_run(base_cmd, timeout=30)
if code != 0: if code != 0:
print(f"[fetch_meta] retry also FAILED code={code}", flush=True) print(f"[fetch_meta] retry also FAILED code={code}", flush=True)
@@ -310,7 +330,7 @@ def fetch_channel_metadata(channel_id: str, max_videos: int = 30, start_video: i
end = (start_video - 1 + max_videos) if start_video > 1 else max_videos end = (start_video - 1 + max_videos) if start_video > 1 else max_videos
args += ["--playlist-end", str(end)] args += ["--playlist-end", str(end)]
stdout, _, code = _run(args, timeout=60) stdout, _, code = _meta_run(args, timeout=60)
if not stdout.strip(): if not stdout.strip():
return None return None
@@ -359,7 +379,7 @@ def fetch_channel_playlists(channel_id: str, max_results: int = 100) -> list[dic
url = f"https://www.youtube.com/{channel_id}/playlists" url = f"https://www.youtube.com/{channel_id}/playlists"
else: else:
url = f"https://www.youtube.com/channel/{channel_id}/playlists" url = f"https://www.youtube.com/channel/{channel_id}/playlists"
stdout, _, code = _run([ stdout, _, code = _meta_run([
"yt-dlp", url, "yt-dlp", url,
"--dump-json", "--flat-playlist", "--dump-json", "--flat-playlist",
"--playlist-end", str(max_results), "--playlist-end", str(max_results),
@@ -552,9 +572,9 @@ def fetch_available_subs(video_id: str) -> dict:
url = f"https://www.youtube.com/watch?v={video_id}" url = f"https://www.youtube.com/watch?v={video_id}"
base_cmd = ["yt-dlp", url, "--dump-json", "--no-download", "--no-playlist"] base_cmd = ["yt-dlp", url, "--dump-json", "--no-download", "--no-playlist"]
cookie_args = _cookie_args() cookie_args = _cookie_args()
stdout, _, code = _run([*base_cmd, *cookie_args], timeout=30) stdout, _, code = _meta_run([*base_cmd, *cookie_args], timeout=30)
if code != 0 and cookie_args: if code != 0 and cookie_args:
stdout, _, code = _run(base_cmd, timeout=30) stdout, _, code = _meta_run(base_cmd, timeout=30)
for line in stdout.splitlines(): for line in stdout.splitlines():
line = line.strip() line = line.strip()