Fix popular fetch and improve date/view_count coverage

Popular fetch now does a two-phase approach: fast flat-playlist to get
IDs in popularity order, then parallel full metadata fetch (8 workers)
to get real view_count and published_at for each video. Previously
flat-playlist mode returned timestamp/view_count as null.

Enrich task now also backfills published_at and view_count (not just
description). Startup limit 3→50, enrichment sleep 2s→0.5s.

Raise all thread pool sizes to match 8-core machine:
- Discovery search: 5→8 workers
- Graph signal: 4→8 workers
- Popular fetch: 5→8 workers
- Download semaphore default 3→6, cap 10→16

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-26 22:36:18 +02:00
parent 5b0cf27f07
commit 2f37072187
5 changed files with 82 additions and 56 deletions

View File

@@ -168,7 +168,7 @@ def on_startup():
# Backfill descriptions for videos that don't have them yet (runs in background)
import threading
from .routers.channels import _enrich_missing_task, _index_channels_batch
threading.Thread(target=_enrich_missing_task, args=(3,), daemon=True).start()
threading.Thread(target=_enrich_missing_task, args=(50,), daemon=True).start()
def _auto_sync_daemon():
import time

View File

@@ -185,16 +185,16 @@ def _discovery_task(user_id: int):
def _enrich_missing_task(limit: int = 20):
"""Fetch full metadata for videos that are missing a description."""
"""Fetch full metadata for videos missing description, published_at, or view_count."""
from ..database import SessionLocal
import time
db = SessionLocal()
try:
rows = db.execute(
text("""
SELECT v.id, v.youtube_video_id FROM videos v
WHERE v.description IS NULL
WHERE v.description IS NULL OR v.published_at IS NULL OR v.view_count IS NULL
ORDER BY
-- prioritise: followed-channel videos first, then discovery queue, then rest
(EXISTS (SELECT 1 FROM user_channels uc
WHERE uc.channel_id = v.channel_id AND uc.status = 'followed')) DESC,
(EXISTS (SELECT 1 FROM discovery_queue dq
@@ -206,7 +206,7 @@ def _enrich_missing_task(limit: int = 20):
).mappings().all()
for i, row in enumerate(rows):
if i > 0:
import time; time.sleep(2)
time.sleep(0.5)
try:
meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
if meta:
@@ -214,6 +214,10 @@ def _enrich_missing_task(limit: int = 20):
if vid:
if meta.get("description") is not None:
vid.description = meta["description"] or ""
if not vid.published_at and meta.get("published_at"):
vid.published_at = meta["published_at"]
if vid.view_count is None and meta.get("view_count") is not None:
vid.view_count = meta["view_count"]
if not vid.tags and meta.get("tags"):
vid.tags = meta["tags"]
if not vid.category and meta.get("category"):
@@ -277,7 +281,7 @@ def sync_all_channels(
background_tasks.add_task(_index_channels_batch, ids, current_user.id)
background_tasks.add_task(_discovery_task, current_user.id)
background_tasks.add_task(_enrich_missing_task, 5)
background_tasks.add_task(_enrich_missing_task, 30)
return {"indexing": len(channels)}
@@ -651,59 +655,81 @@ def fetch_popular_videos(
def _fetch_popular_task(channel_id: int, youtube_channel_id: str):
"""Two-phase popular fetch: get IDs fast via flat-playlist, then enrich with full metadata in parallel."""
from ..database import SessionLocal
from concurrent.futures import ThreadPoolExecutor, as_completed
if youtube_channel_id.startswith("@"):
url = f"https://www.youtube.com/{youtube_channel_id}/videos?sort=p"
else:
url = f"https://www.youtube.com/channel/{youtube_channel_id}/videos?sort=p"
# Phase 1: get ordered list of popular video IDs (fast)
stdout, _, _ = ytdlp._run([
"yt-dlp", url,
"--dump-json", "--flat-playlist",
"--playlist-end", "30",
"--quiet",
*ytdlp._cookie_args(),
], timeout=60)
video_ids = []
for line in stdout.splitlines():
line = line.strip()
if not line:
continue
try:
info = json.loads(line)
yt_id = info.get("id")
if yt_id:
video_ids.append(yt_id)
except json.JSONDecodeError:
continue
if not video_ids:
return
# Phase 2: fetch full metadata in parallel (gets view_count + published_at)
with ThreadPoolExecutor(max_workers=8) as pool:
futures = {pool.submit(ytdlp.fetch_video_metadata, vid): vid for vid in video_ids}
results = {}
for future in as_completed(futures):
vid = futures[future]
try:
results[vid] = future.result()
except Exception:
results[vid] = None
db = SessionLocal()
try:
if youtube_channel_id.startswith("@"):
url = f"https://www.youtube.com/{youtube_channel_id}/videos?sort=p"
else:
url = f"https://www.youtube.com/channel/{youtube_channel_id}/videos?sort=p"
stdout, _, code = ytdlp._run([
"yt-dlp", url,
"--dump-json", "--flat-playlist",
"--playlist-end", "100",
"--quiet",
*ytdlp._cookie_args(),
], timeout=120)
channel = db.query(Channel).filter_by(id=channel_id).first()
if not channel:
return
for line in stdout.splitlines():
line = line.strip()
if not line:
for yt_id in video_ids:
meta = results.get(yt_id)
if not meta:
continue
try:
info = json.loads(line)
except json.JSONDecodeError:
continue
yt_id = info.get("id")
if not yt_id:
continue
existing = db.query(Video).filter_by(youtube_video_id=yt_id).first()
view_count = info.get("view_count")
published_at = ytdlp._parse_published(info)
if existing:
if view_count is not None:
existing.view_count = view_count
if published_at and not existing.published_at:
existing.published_at = published_at
else:
db.add(Video(
youtube_video_id=yt_id,
channel_id=channel.id,
title=info.get("title", ""),
thumbnail_url=ytdlp._stable_thumbnail(yt_id),
duration_seconds=info.get("duration"),
published_at=published_at,
tags=json.dumps(info.get("tags") or []),
view_count=view_count,
))
db.commit()
except Exception:
db.rollback()
existing = db.query(Video).filter_by(youtube_video_id=yt_id).first()
if existing:
if meta.get("view_count") is not None:
existing.view_count = meta["view_count"]
if meta.get("published_at") and not existing.published_at:
existing.published_at = meta["published_at"]
else:
db.add(Video(
youtube_video_id=yt_id,
channel_id=channel.id,
title=meta.get("title", ""),
thumbnail_url=ytdlp._stable_thumbnail(yt_id),
duration_seconds=meta.get("duration_seconds"),
published_at=meta.get("published_at"),
tags=meta.get("tags") or "[]",
view_count=meta.get("view_count"),
))
db.commit()
except Exception:
db.rollback()
finally:
db.close()

View File

@@ -112,7 +112,7 @@ def _search_and_store(
except Exception:
return []
with ThreadPoolExecutor(max_workers=5) as pool:
with ThreadPoolExecutor(max_workers=8) as pool:
futures = {pool.submit(_do_search, q): q for q in queries}
for fut in as_completed(futures):
for video in fut.result():
@@ -620,7 +620,7 @@ def update_graph_signal(db: Session, user_id: int):
return []
featured_map: dict[str, list[str]] = {}
with ThreadPoolExecutor(max_workers=4) as pool:
with ThreadPoolExecutor(max_workers=8) as pool:
futures = {pool.submit(_fetch, row["youtube_channel_id"]): row for row in sample}
for fut in as_completed(futures):
row = futures[fut]

View File

@@ -665,7 +665,7 @@ def predicted_file_path(video_id: str) -> Path:
return Path(settings.download_path) / f"{video_id}.mp4"
_SEMAPHORE = threading.Semaphore(3)
_SEMAPHORE = threading.Semaphore(6)
_semaphore_lock = threading.Lock()
_cookies_browser: str = ""
_cookies_file: str = ""
@@ -682,7 +682,7 @@ _oauth2_state_lock = threading.Lock()
def set_max_concurrent(n: int) -> None:
global _SEMAPHORE
with _semaphore_lock:
_SEMAPHORE = threading.Semaphore(max(1, min(n, 10)))
_SEMAPHORE = threading.Semaphore(max(1, min(n, 16)))
def set_cookies_browser(browser: str) -> None:

View File

@@ -117,7 +117,7 @@ export default function ChannelPage() {
const popularMut = useMutation({
mutationFn: () => fetchPopularVideos(id),
onSuccess: () => scheduleRefetch(20000),
onSuccess: () => scheduleRefetch(35000),
});
const deepSearchMut = useMutation({