Slow down popular fetch and enrich to avoid cookie invalidation

- Popular fetch phase 2: sequential with 2s delay between requests (was 3 parallel workers)
- Reduced from 200 to 100 videos per popular fetch run
- DB writes happen after each video instead of all at the end (an interrupt no longer loses already-fetched results)
- _enrich_missing_task: delay increased 0.5s → 2s between requests

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-26 23:53:37 +02:00
parent ff601d3585
commit 8029b2517f

View File

@@ -210,7 +210,7 @@ def _enrich_missing_task(limit: int = 20):
).mappings().all()
for i, row in enumerate(rows):
if i > 0:
time.sleep(0.5)
time.sleep(2.0)
try:
meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
if meta:
@@ -723,13 +723,12 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
Phase 1 (fast): flat-playlist crawl of the full channel → store any
new videos in DB (title, duration, thumbnail). No individual requests.
Phase 2 (parallel): for every video now in DB for this channel,
fetch its watch page to get real view_count + published_at.
Prioritises videos missing view_count; caps at 200 per run.
Popular tab then sorts by view_count DESC.
Phase 2 (sequential, polite): fetch each video's watch page one at a time
with a 2-second pause between requests to avoid cookie invalidation.
Prioritises videos missing view_count; caps at 100 per run.
"""
import time
from ..database import SessionLocal
from concurrent.futures import ThreadPoolExecutor, as_completed
task_id = f"popular-{channel_id}"
label = f"Popular fetch — {channel_name}" if channel_name else "Popular fetch"
@@ -797,7 +796,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
finally:
db.close()
# Phase 2 — individual fetches for view_count, prioritising missing ones
# Phase 2 — sequential fetches with a polite delay to avoid cookie invalidation
db = SessionLocal()
try:
rows = db.execute(
@@ -805,7 +804,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
SELECT youtube_video_id FROM videos
WHERE channel_id = :cid
ORDER BY (view_count IS NULL) DESC, RANDOM()
LIMIT 200
LIMIT 100
"""),
{"cid": channel_id},
).mappings().all()
@@ -824,41 +823,35 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
_tasks[task_id]["total"] = len(video_ids)
_tasks[task_id]["done"] = 0
results = {}
try:
with ThreadPoolExecutor(max_workers=3) as pool:
futures = {pool.submit(ytdlp.fetch_video_metadata, vid): vid for vid in video_ids}
for future in as_completed(futures):
vid = futures[future]
try:
results[vid] = future.result()
except Exception:
pass
with _tasks_lock:
if task_id in _tasks:
_tasks[task_id]["done"] += 1
for i, yt_id in enumerate(video_ids):
if i > 0:
time.sleep(2.0)
try:
meta = ytdlp.fetch_video_metadata(yt_id)
if meta:
db = SessionLocal()
try:
vid = db.query(Video).filter_by(youtube_video_id=yt_id).first()
if vid:
if meta.get("view_count") is not None:
vid.view_count = meta["view_count"]
if not vid.published_at and meta.get("published_at"):
vid.published_at = meta["published_at"]
db.commit()
except Exception:
db.rollback()
finally:
db.close()
except Exception:
pass
with _tasks_lock:
if task_id in _tasks:
_tasks[task_id]["done"] += 1
finally:
with _tasks_lock:
_tasks.pop(task_id, None)
db = SessionLocal()
try:
for yt_id, meta in results.items():
if not meta:
continue
try:
vid = db.query(Video).filter_by(youtube_video_id=yt_id).first()
if vid:
if meta.get("view_count") is not None:
vid.view_count = meta["view_count"]
if not vid.published_at and meta.get("published_at"):
vid.published_at = meta["published_at"]
db.commit()
except Exception:
db.rollback()
finally:
db.close()
@router.post("/{channel_id}/search", status_code=status.HTTP_202_ACCEPTED)
def search_channel_youtube(