Fix popular fetch and improve date/view_count coverage
Popular fetch now uses a two-phase approach: a fast flat-playlist pass to get video IDs in popularity order, then a parallel full-metadata fetch (8 workers) to get the real view_count and published_at for each video. Previously, flat-playlist mode returned timestamp/view_count as null.

The enrich task now also backfills published_at and view_count (not just description). Startup limit 3→50, enrichment sleep 2s→0.5s.

Raise all thread pool sizes to match an 8-core machine:
- Discovery search: 5→8 workers
- Graph signal: 4→8 workers
- Popular fetch: 5→8 workers
- Download semaphore default 3→6, cap 10→16

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -168,7 +168,7 @@ def on_startup():
|
|||||||
# Backfill descriptions for videos that don't have them yet (runs in background)
|
# Backfill descriptions for videos that don't have them yet (runs in background)
|
||||||
import threading
|
import threading
|
||||||
from .routers.channels import _enrich_missing_task, _index_channels_batch
|
from .routers.channels import _enrich_missing_task, _index_channels_batch
|
||||||
threading.Thread(target=_enrich_missing_task, args=(3,), daemon=True).start()
|
threading.Thread(target=_enrich_missing_task, args=(50,), daemon=True).start()
|
||||||
|
|
||||||
def _auto_sync_daemon():
|
def _auto_sync_daemon():
|
||||||
import time
|
import time
|
||||||
|
|||||||
@@ -185,16 +185,16 @@ def _discovery_task(user_id: int):
|
|||||||
|
|
||||||
|
|
||||||
def _enrich_missing_task(limit: int = 20):
|
def _enrich_missing_task(limit: int = 20):
|
||||||
"""Fetch full metadata for videos that are missing a description."""
|
"""Fetch full metadata for videos missing description, published_at, or view_count."""
|
||||||
from ..database import SessionLocal
|
from ..database import SessionLocal
|
||||||
|
import time
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
rows = db.execute(
|
rows = db.execute(
|
||||||
text("""
|
text("""
|
||||||
SELECT v.id, v.youtube_video_id FROM videos v
|
SELECT v.id, v.youtube_video_id FROM videos v
|
||||||
WHERE v.description IS NULL
|
WHERE v.description IS NULL OR v.published_at IS NULL OR v.view_count IS NULL
|
||||||
ORDER BY
|
ORDER BY
|
||||||
-- prioritise: followed-channel videos first, then discovery queue, then rest
|
|
||||||
(EXISTS (SELECT 1 FROM user_channels uc
|
(EXISTS (SELECT 1 FROM user_channels uc
|
||||||
WHERE uc.channel_id = v.channel_id AND uc.status = 'followed')) DESC,
|
WHERE uc.channel_id = v.channel_id AND uc.status = 'followed')) DESC,
|
||||||
(EXISTS (SELECT 1 FROM discovery_queue dq
|
(EXISTS (SELECT 1 FROM discovery_queue dq
|
||||||
@@ -206,7 +206,7 @@ def _enrich_missing_task(limit: int = 20):
|
|||||||
).mappings().all()
|
).mappings().all()
|
||||||
for i, row in enumerate(rows):
|
for i, row in enumerate(rows):
|
||||||
if i > 0:
|
if i > 0:
|
||||||
import time; time.sleep(2)
|
time.sleep(0.5)
|
||||||
try:
|
try:
|
||||||
meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
|
meta = ytdlp.fetch_video_metadata(row["youtube_video_id"])
|
||||||
if meta:
|
if meta:
|
||||||
@@ -214,6 +214,10 @@ def _enrich_missing_task(limit: int = 20):
|
|||||||
if vid:
|
if vid:
|
||||||
if meta.get("description") is not None:
|
if meta.get("description") is not None:
|
||||||
vid.description = meta["description"] or ""
|
vid.description = meta["description"] or ""
|
||||||
|
if not vid.published_at and meta.get("published_at"):
|
||||||
|
vid.published_at = meta["published_at"]
|
||||||
|
if vid.view_count is None and meta.get("view_count") is not None:
|
||||||
|
vid.view_count = meta["view_count"]
|
||||||
if not vid.tags and meta.get("tags"):
|
if not vid.tags and meta.get("tags"):
|
||||||
vid.tags = meta["tags"]
|
vid.tags = meta["tags"]
|
||||||
if not vid.category and meta.get("category"):
|
if not vid.category and meta.get("category"):
|
||||||
@@ -277,7 +281,7 @@ def sync_all_channels(
|
|||||||
background_tasks.add_task(_index_channels_batch, ids, current_user.id)
|
background_tasks.add_task(_index_channels_batch, ids, current_user.id)
|
||||||
background_tasks.add_task(_discovery_task, current_user.id)
|
background_tasks.add_task(_discovery_task, current_user.id)
|
||||||
|
|
||||||
background_tasks.add_task(_enrich_missing_task, 5)
|
background_tasks.add_task(_enrich_missing_task, 30)
|
||||||
|
|
||||||
return {"indexing": len(channels)}
|
return {"indexing": len(channels)}
|
||||||
|
|
||||||
@@ -651,59 +655,81 @@ def fetch_popular_videos(
|
|||||||
|
|
||||||
|
|
||||||
def _fetch_popular_task(channel_id: int, youtube_channel_id: str):
|
def _fetch_popular_task(channel_id: int, youtube_channel_id: str):
|
||||||
|
"""Two-phase popular fetch: get IDs fast via flat-playlist, then enrich with full metadata in parallel."""
|
||||||
from ..database import SessionLocal
|
from ..database import SessionLocal
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
if youtube_channel_id.startswith("@"):
|
||||||
|
url = f"https://www.youtube.com/{youtube_channel_id}/videos?sort=p"
|
||||||
|
else:
|
||||||
|
url = f"https://www.youtube.com/channel/{youtube_channel_id}/videos?sort=p"
|
||||||
|
|
||||||
|
# Phase 1: get ordered list of popular video IDs (fast)
|
||||||
|
stdout, _, _ = ytdlp._run([
|
||||||
|
"yt-dlp", url,
|
||||||
|
"--dump-json", "--flat-playlist",
|
||||||
|
"--playlist-end", "30",
|
||||||
|
"--quiet",
|
||||||
|
*ytdlp._cookie_args(),
|
||||||
|
], timeout=60)
|
||||||
|
|
||||||
|
video_ids = []
|
||||||
|
for line in stdout.splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
info = json.loads(line)
|
||||||
|
yt_id = info.get("id")
|
||||||
|
if yt_id:
|
||||||
|
video_ids.append(yt_id)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not video_ids:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Phase 2: fetch full metadata in parallel (gets view_count + published_at)
|
||||||
|
with ThreadPoolExecutor(max_workers=8) as pool:
|
||||||
|
futures = {pool.submit(ytdlp.fetch_video_metadata, vid): vid for vid in video_ids}
|
||||||
|
results = {}
|
||||||
|
for future in as_completed(futures):
|
||||||
|
vid = futures[future]
|
||||||
|
try:
|
||||||
|
results[vid] = future.result()
|
||||||
|
except Exception:
|
||||||
|
results[vid] = None
|
||||||
|
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
if youtube_channel_id.startswith("@"):
|
|
||||||
url = f"https://www.youtube.com/{youtube_channel_id}/videos?sort=p"
|
|
||||||
else:
|
|
||||||
url = f"https://www.youtube.com/channel/{youtube_channel_id}/videos?sort=p"
|
|
||||||
|
|
||||||
stdout, _, code = ytdlp._run([
|
|
||||||
"yt-dlp", url,
|
|
||||||
"--dump-json", "--flat-playlist",
|
|
||||||
"--playlist-end", "100",
|
|
||||||
"--quiet",
|
|
||||||
*ytdlp._cookie_args(),
|
|
||||||
], timeout=120)
|
|
||||||
|
|
||||||
channel = db.query(Channel).filter_by(id=channel_id).first()
|
channel = db.query(Channel).filter_by(id=channel_id).first()
|
||||||
if not channel:
|
if not channel:
|
||||||
return
|
return
|
||||||
|
for yt_id in video_ids:
|
||||||
for line in stdout.splitlines():
|
meta = results.get(yt_id)
|
||||||
line = line.strip()
|
if not meta:
|
||||||
if not line:
|
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
info = json.loads(line)
|
existing = db.query(Video).filter_by(youtube_video_id=yt_id).first()
|
||||||
except json.JSONDecodeError:
|
if existing:
|
||||||
continue
|
if meta.get("view_count") is not None:
|
||||||
yt_id = info.get("id")
|
existing.view_count = meta["view_count"]
|
||||||
if not yt_id:
|
if meta.get("published_at") and not existing.published_at:
|
||||||
continue
|
existing.published_at = meta["published_at"]
|
||||||
existing = db.query(Video).filter_by(youtube_video_id=yt_id).first()
|
else:
|
||||||
view_count = info.get("view_count")
|
db.add(Video(
|
||||||
published_at = ytdlp._parse_published(info)
|
youtube_video_id=yt_id,
|
||||||
if existing:
|
channel_id=channel.id,
|
||||||
if view_count is not None:
|
title=meta.get("title", ""),
|
||||||
existing.view_count = view_count
|
thumbnail_url=ytdlp._stable_thumbnail(yt_id),
|
||||||
if published_at and not existing.published_at:
|
duration_seconds=meta.get("duration_seconds"),
|
||||||
existing.published_at = published_at
|
published_at=meta.get("published_at"),
|
||||||
else:
|
tags=meta.get("tags") or "[]",
|
||||||
db.add(Video(
|
view_count=meta.get("view_count"),
|
||||||
youtube_video_id=yt_id,
|
))
|
||||||
channel_id=channel.id,
|
db.commit()
|
||||||
title=info.get("title", ""),
|
except Exception:
|
||||||
thumbnail_url=ytdlp._stable_thumbnail(yt_id),
|
db.rollback()
|
||||||
duration_seconds=info.get("duration"),
|
|
||||||
published_at=published_at,
|
|
||||||
tags=json.dumps(info.get("tags") or []),
|
|
||||||
view_count=view_count,
|
|
||||||
))
|
|
||||||
db.commit()
|
|
||||||
except Exception:
|
|
||||||
db.rollback()
|
|
||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ def _search_and_store(
|
|||||||
except Exception:
|
except Exception:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=5) as pool:
|
with ThreadPoolExecutor(max_workers=8) as pool:
|
||||||
futures = {pool.submit(_do_search, q): q for q in queries}
|
futures = {pool.submit(_do_search, q): q for q in queries}
|
||||||
for fut in as_completed(futures):
|
for fut in as_completed(futures):
|
||||||
for video in fut.result():
|
for video in fut.result():
|
||||||
@@ -620,7 +620,7 @@ def update_graph_signal(db: Session, user_id: int):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
featured_map: dict[str, list[str]] = {}
|
featured_map: dict[str, list[str]] = {}
|
||||||
with ThreadPoolExecutor(max_workers=4) as pool:
|
with ThreadPoolExecutor(max_workers=8) as pool:
|
||||||
futures = {pool.submit(_fetch, row["youtube_channel_id"]): row for row in sample}
|
futures = {pool.submit(_fetch, row["youtube_channel_id"]): row for row in sample}
|
||||||
for fut in as_completed(futures):
|
for fut in as_completed(futures):
|
||||||
row = futures[fut]
|
row = futures[fut]
|
||||||
|
|||||||
@@ -665,7 +665,7 @@ def predicted_file_path(video_id: str) -> Path:
|
|||||||
return Path(settings.download_path) / f"{video_id}.mp4"
|
return Path(settings.download_path) / f"{video_id}.mp4"
|
||||||
|
|
||||||
|
|
||||||
_SEMAPHORE = threading.Semaphore(3)
|
_SEMAPHORE = threading.Semaphore(6)
|
||||||
_semaphore_lock = threading.Lock()
|
_semaphore_lock = threading.Lock()
|
||||||
_cookies_browser: str = ""
|
_cookies_browser: str = ""
|
||||||
_cookies_file: str = ""
|
_cookies_file: str = ""
|
||||||
@@ -682,7 +682,7 @@ _oauth2_state_lock = threading.Lock()
|
|||||||
def set_max_concurrent(n: int) -> None:
|
def set_max_concurrent(n: int) -> None:
|
||||||
global _SEMAPHORE
|
global _SEMAPHORE
|
||||||
with _semaphore_lock:
|
with _semaphore_lock:
|
||||||
_SEMAPHORE = threading.Semaphore(max(1, min(n, 10)))
|
_SEMAPHORE = threading.Semaphore(max(1, min(n, 16)))
|
||||||
|
|
||||||
|
|
||||||
def set_cookies_browser(browser: str) -> None:
|
def set_cookies_browser(browser: str) -> None:
|
||||||
|
|||||||
@@ -117,7 +117,7 @@ export default function ChannelPage() {
|
|||||||
|
|
||||||
const popularMut = useMutation({
|
const popularMut = useMutation({
|
||||||
mutationFn: () => fetchPopularVideos(id),
|
mutationFn: () => fetchPopularVideos(id),
|
||||||
onSuccess: () => scheduleRefetch(20000),
|
onSuccess: () => scheduleRefetch(35000),
|
||||||
});
|
});
|
||||||
|
|
||||||
const deepSearchMut = useMutation({
|
const deepSearchMut = useMutation({
|
||||||
|
|||||||
Reference in New Issue
Block a user