Slow down popular fetch and enrich to protect cookies
- Popular fetch phase 2: sequential with a 2s delay between requests (was 3 parallel workers)
- Reduced from 200 to 100 videos per popular fetch run
- DB writes happen after each video instead of all at the end (no data loss on interrupt)
- _enrich_missing_task: delay increased 0.5s → 2s between requests

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -210,7 +210,7 @@ def _enrich_missing_task(limit: int = 20):
|
|||||||
).mappings().all()
|
).mappings().all()
|
||||||
for i, row in enumerate(rows):
|
for i, row in enumerate(rows):
|
||||||
if i > 0:
|
if i > 0:
|
||||||
time.sleep(0.5)
|
time.sleep(2.0)
|
||||||
try:
|
try:
|
||||||
meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
|
meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
|
||||||
if meta:
|
if meta:
|
||||||
@@ -723,13 +723,12 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
|
|||||||
Phase 1 (fast): flat-playlist crawl of the full channel → store any
|
Phase 1 (fast): flat-playlist crawl of the full channel → store any
|
||||||
new videos in DB (title, duration, thumbnail). No individual requests.
|
new videos in DB (title, duration, thumbnail). No individual requests.
|
||||||
|
|
||||||
Phase 2 (parallel): for every video now in DB for this channel,
|
Phase 2 (sequential, polite): fetch each video's watch page one at a time
|
||||||
fetch its watch page to get real view_count + published_at.
|
with a 2-second pause between requests to avoid cookie invalidation.
|
||||||
Prioritises videos missing view_count; caps at 200 per run.
|
Prioritises videos missing view_count; caps at 100 per run.
|
||||||
Popular tab then sorts by view_count DESC.
|
|
||||||
"""
|
"""
|
||||||
|
import time
|
||||||
from ..database import SessionLocal
|
from ..database import SessionLocal
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
||||||
|
|
||||||
task_id = f"popular-{channel_id}"
|
task_id = f"popular-{channel_id}"
|
||||||
label = f"Popular fetch — {channel_name}" if channel_name else "Popular fetch"
|
label = f"Popular fetch — {channel_name}" if channel_name else "Popular fetch"
|
||||||
@@ -797,7 +796,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
|
|||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
# Phase 2 — individual fetches for view_count, prioritising missing ones
|
# Phase 2 — sequential fetches with a polite delay to avoid cookie invalidation
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
rows = db.execute(
|
rows = db.execute(
|
||||||
@@ -805,7 +804,7 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
|
|||||||
SELECT youtube_video_id FROM videos
|
SELECT youtube_video_id FROM videos
|
||||||
WHERE channel_id = :cid
|
WHERE channel_id = :cid
|
||||||
ORDER BY (view_count IS NULL) DESC, RANDOM()
|
ORDER BY (view_count IS NULL) DESC, RANDOM()
|
||||||
LIMIT 200
|
LIMIT 100
|
||||||
"""),
|
"""),
|
||||||
{"cid": channel_id},
|
{"cid": channel_id},
|
||||||
).mappings().all()
|
).mappings().all()
|
||||||
@@ -824,28 +823,14 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
|
|||||||
_tasks[task_id]["total"] = len(video_ids)
|
_tasks[task_id]["total"] = len(video_ids)
|
||||||
_tasks[task_id]["done"] = 0
|
_tasks[task_id]["done"] = 0
|
||||||
|
|
||||||
results = {}
|
|
||||||
try:
|
try:
|
||||||
with ThreadPoolExecutor(max_workers=3) as pool:
|
for i, yt_id in enumerate(video_ids):
|
||||||
futures = {pool.submit(ytdlp.fetch_video_metadata, vid): vid for vid in video_ids}
|
if i > 0:
|
||||||
for future in as_completed(futures):
|
time.sleep(2.0)
|
||||||
vid = futures[future]
|
|
||||||
try:
|
try:
|
||||||
results[vid] = future.result()
|
meta = ytdlp.fetch_video_metadata(yt_id)
|
||||||
except Exception:
|
if meta:
|
||||||
pass
|
|
||||||
with _tasks_lock:
|
|
||||||
if task_id in _tasks:
|
|
||||||
_tasks[task_id]["done"] += 1
|
|
||||||
finally:
|
|
||||||
with _tasks_lock:
|
|
||||||
_tasks.pop(task_id, None)
|
|
||||||
|
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
|
||||||
for yt_id, meta in results.items():
|
|
||||||
if not meta:
|
|
||||||
continue
|
|
||||||
try:
|
try:
|
||||||
vid = db.query(Video).filter_by(youtube_video_id=yt_id).first()
|
vid = db.query(Video).filter_by(youtube_video_id=yt_id).first()
|
||||||
if vid:
|
if vid:
|
||||||
@@ -858,6 +843,14 @@ def _fetch_popular_task(channel_id: int, youtube_channel_id: str, channel_name:
|
|||||||
db.rollback()
|
db.rollback()
|
||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
with _tasks_lock:
|
||||||
|
if task_id in _tasks:
|
||||||
|
_tasks[task_id]["done"] += 1
|
||||||
|
finally:
|
||||||
|
with _tasks_lock:
|
||||||
|
_tasks.pop(task_id, None)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/{channel_id}/search", status_code=status.HTTP_202_ACCEPTED)
|
@router.post("/{channel_id}/search", status_code=status.HTTP_202_ACCEPTED)
|
||||||
|
|||||||
Reference in New Issue
Block a user