Fetch popular: flat-playlist crawl then parallel view_count enrichment
Phase 1: crawl the full channel with yt-dlp's --flat-playlist mode and store any videos not yet in the DB (fast; no per-video requests).
Phase 2: fetch the real view_count for up to 200 of the channel's videos in parallel (8 workers), prioritising those missing a count.
The Popular tab then sorts all channel videos by view_count DESC NULLS LAST.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -657,19 +657,75 @@ def fetch_popular_videos(
|
|||||||
|
|
||||||
|
|
||||||
def _fetch_popular_task(channel_id: int, youtube_channel_id: str):
|
def _fetch_popular_task(channel_id: int, youtube_channel_id: str):
|
||||||
"""Enrich indexed videos with view_count so Popular tab can rank them.
|
"""Half-and-half popular fetch.
|
||||||
|
|
||||||
YouTube's ?sort=p is broken in yt-dlp (their own tests mark it skipped).
|
Phase 1 (fast): flat-playlist crawl of the full channel → store any
|
||||||
Instead we fetch real view counts for all indexed videos via individual
|
new videos in DB (title, duration, thumbnail). No individual requests.
|
||||||
page requests and sort by view_count DESC locally.
|
|
||||||
|
Phase 2 (parallel): for every video now in DB for this channel,
|
||||||
|
fetch its watch page to get real view_count + published_at.
|
||||||
Prioritises videos missing view_count; caps at 200 per run.
|
Prioritises videos missing view_count; caps at 200 per run.
|
||||||
|
Popular tab then sorts by view_count DESC.
|
||||||
"""
|
"""
|
||||||
from ..database import SessionLocal
|
from ..database import SessionLocal
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
# Phase 1 — flat-playlist: crawl all channel videos quickly
|
||||||
|
if youtube_channel_id.startswith("@"):
|
||||||
|
url = f"https://www.youtube.com/{youtube_channel_id}/videos"
|
||||||
|
else:
|
||||||
|
url = f"https://www.youtube.com/channel/{youtube_channel_id}/videos"
|
||||||
|
|
||||||
|
stdout, _, _ = ytdlp._run([
|
||||||
|
"yt-dlp", url,
|
||||||
|
"--dump-json", "--flat-playlist",
|
||||||
|
"--quiet",
|
||||||
|
*ytdlp._cookie_args(),
|
||||||
|
], timeout=120)
|
||||||
|
|
||||||
|
flat_entries = []
|
||||||
|
for line in stdout.splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
info = json.loads(line)
|
||||||
|
yt_id = info.get("id")
|
||||||
|
if yt_id:
|
||||||
|
flat_entries.append({
|
||||||
|
"id": yt_id,
|
||||||
|
"title": info.get("title", ""),
|
||||||
|
"duration": info.get("duration"),
|
||||||
|
})
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Store any new videos from the flat crawl
|
||||||
|
if flat_entries:
|
||||||
|
db = SessionLocal()
|
||||||
|
try:
|
||||||
|
channel = db.query(Channel).filter_by(id=channel_id).first()
|
||||||
|
if channel:
|
||||||
|
for entry in flat_entries:
|
||||||
|
if not db.query(Video).filter_by(youtube_video_id=entry["id"]).first():
|
||||||
|
try:
|
||||||
|
db.add(Video(
|
||||||
|
youtube_video_id=entry["id"],
|
||||||
|
channel_id=channel_id,
|
||||||
|
title=entry["title"],
|
||||||
|
thumbnail_url=ytdlp._stable_thumbnail(entry["id"]),
|
||||||
|
duration_seconds=entry["duration"],
|
||||||
|
tags="[]",
|
||||||
|
))
|
||||||
|
db.commit()
|
||||||
|
except Exception:
|
||||||
|
db.rollback()
|
||||||
|
finally:
|
||||||
|
db.close()
|
||||||
|
|
||||||
|
# Phase 2 — individual fetches for view_count, prioritising missing ones
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
# Videos without view_count first, then those with stale counts
|
|
||||||
rows = db.execute(
|
rows = db.execute(
|
||||||
text("""
|
text("""
|
||||||
SELECT youtube_video_id FROM videos
|
SELECT youtube_video_id FROM videos
|
||||||
|
|||||||
Reference in New Issue
Block a user