# vps-management-bot/scripts/tmdb_emby_sync.py

#!/usr/bin/env python3 -u
"""
Batch update Emby episode overviews from TMDB Chinese metadata.
Run with: python3 -u scripts/tmdb_emby_sync.py
"""

import json
import re
import sys
import time
import urllib.request
import urllib.parse
import urllib.error
from html import unescape

# Base URL of the Emby server; every API path used below is appended to it.
EMBY_URL = "http://145.239.143.92:8096"
# NOTE(review): the API key and user id are committed in source — consider
# loading them from the environment or a config file kept out of version control.
# Sent as the ``api_key`` query parameter on every Emby request.
API_KEY = "e3e52b1dcb8b47c39d46b5256bf87081"
# User id placed in the ``/Users/{id}/Items/{item}`` path when an episode is
# read back before updating it; presumably an administrator account — confirm.
ADMIN_UID = "0f026d40c1e04bb7a099aab75a501614"

# Series to process, as (display name, Emby series id, TMDB TV id) tuples.
# The display name is also what the optional command-line filter is matched
# against (substring match).
SERIES = [
    ("小猪佩奇", "13", 12225),
    ("安全警长啦咘啦哆", "18", 219799),
    ("动物神探队", "11", 195407),
    ("啦咘啦哆警长大战羚羚羊", "12", 253041),
    ("布鲁伊", "1548", 82728),
    ("汪汪队立大功", "14", 57532),
    ("小恐龙大冒险", "1547", 82027),
    ("海底小纵队", "16", 37472),
    ("海底小纵队：中国之旅", "17", 132983),
    ("小马宝莉：友谊大魔法", "15", 20085),
]


def api_get(path, params=None):
    """Send a GET request to the Emby server and return the decoded JSON body.

    Args:
        path: Request path appended to ``EMBY_URL`` (for example ``"/Items"``).
        params: Optional mapping of query-string parameters. The mapping is
            copied before ``api_key`` is added, so the caller's object is
            left untouched.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` included).
        ValueError: If the response body is not valid JSON.
    """
    # Build a fresh dict so the caller's mapping never gains the API key
    # (the key still overrides any "api_key" entry the caller supplied).
    query = {**(params or {}), "api_key": API_KEY}
    url = f"{EMBY_URL}{path}?{urllib.parse.urlencode(query)}"
    with urllib.request.urlopen(url, timeout=30) as resp:
        return json.loads(resp.read())


def api_post(path, data, params=None):
    """Send a JSON POST request to the Emby server and return the HTTP status.

    Args:
        path: Request path appended to ``EMBY_URL``.
        data: JSON-serialisable payload; sent UTF-8 encoded with a
            ``Content-Type: application/json`` header.
        params: Optional mapping of query-string parameters. The mapping is
            copied before ``api_key`` is added, so the caller's object is
            left untouched.

    Returns:
        The integer HTTP status of the response.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` included).
        TypeError: If ``data`` cannot be serialised to JSON.
    """
    # Build a fresh dict so the caller's mapping never gains the API key
    # (the key still overrides any "api_key" entry the caller supplied).
    query = {**(params or {}), "api_key": API_KEY}
    url = f"{EMBY_URL}{path}?{urllib.parse.urlencode(query)}"
    body = json.dumps(data).encode("utf-8")
    req = urllib.request.Request(url, data=body, method="POST")
    req.add_header("Content-Type", "application/json")
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.status


def fetch_all_episodes():
    """Fetch every episode from Emby in pages and group them by series.

    Requests ``/Items`` recursively for items of type ``Episode``, 200 at a
    time, advancing ``StartIndex`` by the number of items actually returned.

    Returns:
        A ``defaultdict(list)`` mapping ``str(SeriesId)`` (the empty string
        when an item has no ``SeriesId``) to the list of episode dicts.

    Raises:
        KeyError: If a response lacks ``Items`` or ``TotalRecordCount``.
        urllib.error.URLError: If a request fails.
    """
    from collections import defaultdict

    by_series = defaultdict(list)
    start = 0
    while True:
        data = api_get("/Items", {
            "Recursive": "true",
            "IncludeItemTypes": "Episode",
            "Fields": "Overview,ParentIndexNumber,IndexNumber,SeriesId,SeriesName",
            "StartIndex": str(start),
            "Limit": "200",
        })
        items = data["Items"]
        for item in items:
            by_series[str(item.get("SeriesId", ""))].append(item)
        start += len(items)
        # An empty page cannot advance the offset; stop there so a
        # TotalRecordCount that overstates the available items cannot keep
        # this loop requesting the same page forever.
        if not items or start >= data["TotalRecordCount"]:
            break
    return by_series

# Lazily populated cache of all episodes grouped by series id; stays ``None``
# until the first lookup triggers the full fetch.
_all_eps = None


def fetch_emby_episodes(series_id):
    """Return the cached list of Emby episodes for ``series_id``.

    The complete episode listing is fetched once, on the first call, and
    reused afterwards. ``series_id`` is converted to ``str`` for the lookup;
    an unknown id yields an empty list.
    """
    global _all_eps
    cache = _all_eps
    if cache is None:
        cache = _all_eps = fetch_all_episodes()
    key = str(series_id)
    # Membership test rather than indexing, so a defaultdict cache is not
    # grown with empty entries for unknown ids.
    return cache[key] if key in cache else []


# Phrases that indicate TMDB has no real overview for an episode.
_TMDB_PLACEHOLDER_PHRASES = (
    "暂无英文版的简介",
    "We don't have an overview",
    "No overview",
    "请添加内容帮助我们完善数据库",
)

_TMDB_EPISODE_NUMBER_RE = re.compile(r'data-episode-number="(\d+)"')
_TMDB_OVERVIEW_RE = re.compile(
    r'<div class="overview">\s*<p>(.*?)</p>', re.DOTALL
)
_HTML_TAG_RE = re.compile(r'<[^>]+>')


def _parse_tmdb_season_html(page_html):
    """Extract ``{episode_number: overview}`` from a TMDB season page's HTML."""
    episodes = {}
    # The page is cut at every card opener; chunk 0 is whatever precedes the
    # first card, so it is dropped.
    for card in page_html.split('<div class="card"')[1:]:
        ep_match = _TMDB_EPISODE_NUMBER_RE.search(card)
        if not ep_match:
            continue
        ov_match = _TMDB_OVERVIEW_RE.search(card)
        if not ov_match:
            continue
        # Drop inline markup first, then decode entities such as ``&amp;``.
        overview = _HTML_TAG_RE.sub('', ov_match.group(1)).strip()
        overview = unescape(overview).strip()
        # Skip empty or near-empty text and TMDB's "no overview" boilerplate.
        if len(overview) < 5:
            continue
        if any(phrase in overview for phrase in _TMDB_PLACEHOLDER_PHRASES):
            continue
        episodes[int(ep_match.group(1))] = overview
    return episodes


def fetch_and_parse_tmdb_season(tmdb_id, season_num, *, page_html=None):
    """Return the zh-CN episode overviews of one TMDB season.

    Args:
        tmdb_id: TMDB TV show id used in the page URL.
        season_num: Season number used in the page URL.
        page_html: Optional already-downloaded HTML of the season page. When
            given, no network request is made and this markup is parsed
            instead (useful for cached pages and offline checks).

    Returns:
        A dict mapping episode number (``int``) to overview text. Episodes
        whose overview is missing, shorter than five characters, or a known
        placeholder phrase are omitted. An HTTP 404 for the season page
        yields an empty dict.

    Raises:
        urllib.error.HTTPError: For any HTTP error other than 404.
        urllib.error.URLError: If the request cannot be made.
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    if page_html is None:
        url = f"https://www.themoviedb.org/tv/{tmdb_id}/season/{season_num}?language=zh-CN"
        req = urllib.request.Request(url, headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
            "Accept-Language": "zh-CN,zh;q=0.9",
        })
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                page_html = resp.read().decode("utf-8")
        except urllib.error.HTTPError as e:
            # A missing season page is treated as "no data", not a failure.
            if e.code == 404:
                return {}
            raise
    return _parse_tmdb_season_html(page_html)


def needs_update(ep):
    """Tell whether an Emby episode dict lacks a usable overview.

    Returns ``True`` when ``Overview`` is absent, falsy, or shorter than five
    characters once surrounding whitespace is removed; ``False`` otherwise.
    """
    text = ep.get("Overview", "")
    if not text:
        return True
    return len(text.strip()) < 5


def process_series(series_name, series_id, tmdb_id):
    """Fill in missing Emby episode overviews for one series from TMDB.

    Episodes of the series that ``needs_update`` flags are grouped by season;
    for each season the TMDB page is fetched once and every episode with a
    matching episode number gets its ``Overview`` written back to Emby.
    Progress is printed as the work proceeds.

    Args:
        series_name: Display name, used only in the printed output.
        series_id: Emby series id used to look up the episodes.
        tmdb_id: TMDB TV show id used to fetch season pages.

    Returns:
        The number of episodes whose overview was written to Emby.
    """
    print(f"\n{'='*50}", flush=True)
    print(f"{series_name} (Emby:{series_id} TMDB:{tmdb_id})", flush=True)

    emby_eps = fetch_emby_episodes(series_id)
    missing = [e for e in emby_eps if needs_update(e)]
    print(f"Total: {len(emby_eps)}, Missing: {len(missing)}", flush=True)

    if not missing:
        print("Nothing to update.", flush=True)
        return 0

    # Group the episodes by season in a single pass. Episodes without a
    # season number cannot be matched to a TMDB season page, so they are set
    # aside instead of triggering a request for season 0 that could never
    # match them (and a None season would break sorting and formatting).
    by_season = {}
    no_season = 0
    for ep in missing:
        sn = ep.get("ParentIndexNumber")
        if sn is None:
            no_season += 1
            continue
        by_season.setdefault(sn, []).append(ep)

    seasons = sorted(by_season)
    print(f"Seasons: {seasons}", flush=True)
    if no_season:
        print(f"  Skipping {no_season} episode(s) without a season number", flush=True)

    updated = 0
    no_tmdb = 0

    for sn in seasons:
        print(f"  S{sn:02d}: fetching TMDB...", end=" ", flush=True)
        try:
            tmdb_eps = fetch_and_parse_tmdb_season(tmdb_id, sn)
        except Exception as e:
            # Best effort: a failed season is reported and skipped. The pause
            # still applies so repeated failures do not hit TMDB back to back.
            print(f"ERROR: {e}", flush=True)
            time.sleep(1.5)
            continue
        print(f"{len(tmdb_eps)} eps found", flush=True)

        # Throttle between TMDB page requests.
        time.sleep(1.5)

        for ep in by_season[sn]:
            ep_num = ep.get("IndexNumber")
            if ep_num is None:
                continue

            overview = tmdb_eps.get(ep_num)
            if not overview:
                no_tmdb += 1
                continue

            try:
                # Read the full item, change only the overview, and post the
                # whole item back.
                item = api_get(f"/Users/{ADMIN_UID}/Items/{ep['Id']}")
                item["Overview"] = overview
                api_post(f"/Items/{ep['Id']}", item)
                updated += 1
                print(f"    ✓ E{ep_num:02d}: {overview[:50]}", flush=True)
            except Exception as e:
                # Best effort per episode: report the failure and move on.
                print(f"    ✗ E{ep_num:02d}: {e}", flush=True)

            # Short pause between Emby updates.
            time.sleep(0.2)

    print(f"  Done: {updated} updated, {no_tmdb} no TMDB data", flush=True)
    return updated


def main():
    """Process every configured series, or only those matching a filter.

    The first command-line argument, when present and non-empty, is a
    substring filter on the series display name. The grand total of updated
    episodes is printed at the end.
    """
    wanted = sys.argv[1] if len(sys.argv) > 1 else None
    total = sum(
        process_series(title, emby_id, tmdb_id)
        for title, emby_id, tmdb_id in SERIES
        if not wanted or wanted in title
    )
    print(f"\n{'='*50}", flush=True)
    print(f"TOTAL UPDATED: {total}", flush=True)


# Run the sync only when executed as a script, not when imported.
if __name__ == "__main__":
    main()