Slow down popular fetch and enrich tasks to avoid cookie invalidation

- Popular fetch phase 2: sequential with 2s delay between requests (was 3 parallel workers)
- Reduced from 200 to 100 videos per popular fetch run
- DB writes happen after each video instead of all at the end (no data loss if the run is interrupted)
- _enrich_missing_task: delay increased 0.5s → 2s between requests

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-26 23:53:37 +02:00
parent ff601d3585
commit 8029b2517f

View File

@@ -210,7 +210,7 @@ def _enrich_missing_task(limit: int = 20):
).mappings().all() ).mappings().all()
for i, row in enumerate(rows): for i, row in enumerate(rows):
if i > 0: if i > 0:
time.sleep(0.5) time.sleep(2.0)
try: try:
meta = ytdlp.fetch_video_metadata(row["youtube_video_id"]) meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
if meta: if meta:
@@ -723,13 +723,12 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
Phase 1 (fast): flat-playlist crawl of the full channel → store any Phase 1 (fast): flat-playlist crawl of the full channel → store any
new videos in DB (title, duration, thumbnail). No individual requests. new videos in DB (title, duration, thumbnail). No individual requests.
Phase 2 (parallel): for every video now in DB for this channel, Phase 2 (sequential, polite): fetch each video's watch page one at a time
fetch its watch page to get real view_count + published_at. with a 2-second pause between requests to avoid cookie invalidation.
Prioritises videos missing view_count; caps at 200 per run. Prioritises videos missing view_count; caps at 100 per run.
Popular tab then sorts by view_count DESC.
""" """
import time
from ..database import SessionLocal from ..database import SessionLocal
from concurrent.futures import ThreadPoolExecutor, as_completed
task_id = f"popular-{channel_id}" task_id = f"popular-{channel_id}"
label = f"Popular fetch — {channel_name}" if channel_name else "Popular fetch" label = f"Popular fetch — {channel_name}" if channel_name else "Popular fetch"
@@ -797,7 +796,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
finally: finally:
db.close() db.close()
# Phase 2 — individual fetches for view_count, prioritising missing ones # Phase 2 — sequential fetches with a polite delay to avoid cookie invalidation
db = SessionLocal() db = SessionLocal()
try: try:
rows = db.execute( rows = db.execute(
@@ -805,7 +804,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
SELECT youtube_video_id FROM videos SELECT youtube_video_id FROM videos
WHERE channel_id = :cid WHERE channel_id = :cid
ORDER BY (view_count IS NULL) DESC, RANDOM() ORDER BY (view_count IS NULL) DESC, RANDOM()
LIMIT 200 LIMIT 100
"""), """),
{"cid": channel_id}, {"cid": channel_id},
).mappings().all() ).mappings().all()
@@ -824,28 +823,14 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
_tasks[task_id]["total"] = len(video_ids) _tasks[task_id]["total"] = len(video_ids)
_tasks[task_id]["done"] = 0 _tasks[task_id]["done"] = 0
results = {}
try: try:
with ThreadPoolExecutor(max_workers=3) as pool: for i, yt_id in enumerate(video_ids):
futures = {pool.submit(ytdlp.fetch_video_metadata, vid): vid for vid in video_ids} if i > 0:
for future in as_completed(futures): time.sleep(2.0)
vid = futures[future]
try: try:
results[vid] = future.result() meta = ytdlp.fetch_video_metadata(yt_id)
except Exception: if meta:
pass
with _tasks_lock:
if task_id in _tasks:
_tasks[task_id]["done"] += 1
finally:
with _tasks_lock:
_tasks.pop(task_id, None)
db = SessionLocal() db = SessionLocal()
try:
for yt_id, meta in results.items():
if not meta:
continue
try: try:
vid = db.query(Video).filter_by(youtube_video_id=yt_id).first() vid = db.query(Video).filter_by(youtube_video_id=yt_id).first()
if vid: if vid:
@@ -858,6 +843,14 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
db.rollback() db.rollback()
finally: finally:
db.close() db.close()
except Exception:
pass
with _tasks_lock:
if task_id in _tasks:
_tasks[task_id]["done"] += 1
finally:
with _tasks_lock:
_tasks.pop(task_id, None)
@router.post("/{channel_id}/search", status_code=status.HTTP_202_ACCEPTED) @router.post("/{channel_id}/search", status_code=status.HTTP_202_ACCEPTED)