diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..7715dcb --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,23 @@ +# Agent Review Instructions + +You are reviewing a Python + ffmpeg video editing tool. + +Main goal: +Build a reliable SRT-driven video editor for Chinese drama recap videos. + +Please focus on: +- code structure +- ffmpeg stability +- SRT parsing correctness +- JSON validation +- Windows path compatibility +- Chinese subtitle rendering +- error handling +- extensibility + +Do not rewrite the entire project unless necessary. +Prefer small, safe patches. +Classify suggestions into: +1. Must fix +2. Should improve +3. Later diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..61b1e4c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,33 @@ +# srt_video_editor 项目说明 + +本项目是电视剧解说自动剪辑工具。 + +核心目标: +根据 script.srt 和 edit_plan.json,从 source.mp4 中截取画面,拼接、加配音、烧字幕,输出 final.mp4。 + +当前阶段: +只做 SRT 驱动剪辑,不做 AI 自动理解剧情。 + +技术要求: +- Python 3.10+ +- ffmpeg +- Windows 优先 +- 路径尽量使用英文 +- 不使用 moviepy,优先直接调用 ffmpeg +- 输出日志要清楚 +- 不要引入复杂前端 + +核心输入: +- input/source.mp4 +- input/script.srt +- input/edit_plan.json +- input/voice.wav + +核心输出: +- output/final.mp4 + +禁止事项: +- 不要破解剪映 +- 不要调用未授权接口 +- 不要一次性做复杂 AI 自动分析 +- 不要改动 input 原始文件 diff --git a/examples/srt_driven/_smoke_test.py b/examples/srt_driven/_smoke_test.py new file mode 100644 index 0000000..88053e2 --- /dev/null +++ b/examples/srt_driven/_smoke_test.py @@ -0,0 +1,421 @@ +"""Regression tests for srt_driven_edit. 
def fail(msg: str) -> None:
    """Abort the test run because an expected error did not occur.

    Raises ``AssertionError`` rather than ``SystemExit`` on purpose: the
    negative tests in this script are written as

        try:
            <call that should hard-fail>
            fail("... should have errored")
        except SystemExit as e:
            ok(...)

    so a ``SystemExit`` raised here would be caught by the same handler that
    is meant to catch the validator's error, and the test would print ``ok``
    even though the validator accepted bad input. ``AssertionError`` is not
    caught by ``except SystemExit`` and still ends the script with a non-zero
    exit status when it propagates to the top level.

    Args:
        msg: Short description of what should have failed.

    Raises:
        AssertionError: always.
    """
    raise AssertionError(f"  FAIL: {msg}")
CJK detection + auto style + style templates --------------------- + +section("CJK detection + style resolution") +cues_cjk = parse_srt(base / "script_cjk.srt") +assert has_cjk(cues_cjk) is True +assert not has_cjk(cues) +ok("CJK regex matches CN/EN correctly") +auto_cjk = resolve_style("auto", cues_cjk) +auto_en = resolve_style("auto", cues) +assert "Microsoft YaHei UI" in auto_cjk +assert "Helvetica" in auto_en +ok("auto style picks YaHei for CJK, Helvetica for EN") +assert STYLE_TEMPLATES["cjk-natural"] == resolve_style("cjk-natural", cues) +ok("named template lookup") +raw = "FontName=Custom,FontSize=24" +assert resolve_style(raw, cues) == raw +ok("raw ASS string passthrough") + + +# -- 3. SRT encoding fallback (GBK / utf-8-sig / utf-8) ------------------ + +section("read_srt_text encoding fallback") +tmp = Path(tempfile.mkdtemp(prefix="srt_smoke_")) + +cjk_payload = "1\n00:00:00,000 --> 00:00:03,000\n中文字幕测试\n" + +# utf-8 +(tmp / "u8.srt").write_bytes(cjk_payload.encode("utf-8")) +text = read_srt_text(tmp / "u8.srt") +assert "中文字幕测试" in text, f"utf-8 decode wrong: {text!r}" +ok("utf-8 decoded") + +# utf-8 with BOM +(tmp / "u8bom.srt").write_bytes(b"\xef\xbb\xbf" + cjk_payload.encode("utf-8")) +text = read_srt_text(tmp / "u8bom.srt") +assert text.startswith("1") and "中文" in text, f"utf-8-sig decode wrong: {text!r}" +ok("utf-8-sig BOM stripped + decoded") + +# gb18030 (typical Windows Chinese) +(tmp / "gb.srt").write_bytes(cjk_payload.encode("gb18030")) +text = read_srt_text(tmp / "gb.srt") +assert "中文字幕测试" in text, f"gb18030 decode wrong: {text!r}" +ok("gb18030 decoded via fallback") + +# cp936 (a.k.a. 
GBK, Windows Chinese ANSI) +(tmp / "cp936.srt").write_bytes(cjk_payload.encode("cp936")) +text = read_srt_text(tmp / "cp936.srt") +assert "中文字幕测试" in text +ok("cp936 decoded via fallback") + +# Now parse a GBK-encoded full SRT end-to-end +gbk_full = ( + "1\n00:00:00,000 --> 00:00:03,000\n这是第一条\n\n" + "2\n00:00:03,000 --> 00:00:06,000\n这是第二条\n" +) +gbk_path = tmp / "full_gbk.srt" +gbk_path.write_bytes(gbk_full.encode("gb18030")) +cues_gbk = parse_srt(gbk_path) +assert len(cues_gbk) == 2 +assert cues_gbk[0].text == "这是第一条" +assert cues_gbk[1].text == "这是第二条" +ok("parse_srt end-to-end on GB18030 input") + + +# -- 4. SRT cue settings tolerance --------------------------------------- + +section("Cue settings on time line") +# Real-world examples: 'position:90% align:start' on the right of --> +samples = [ + ("00:00:00,000 --> 00:00:03,000 position:90%", (0.0, 3.0)), + ("00:00:01,500 --> 00:00:04,200 align:start line:80%", (1.5, 4.2)), + (" 00:00:02,000 --> 00:00:05,000 X1:10 X2:200 Y1:5 Y2:50", (2.0, 5.0)), + ("00:00:00.500 --> 00:00:01.000", (0.5, 1.0)), # dot fraction +] +for line, expected in samples: + a, b = _split_time_line(line) + from srt_driven_edit import parse_timestamp + got = (parse_timestamp(a), parse_timestamp(b)) + assert abs(got[0] - expected[0]) < 1e-6 and abs(got[1] - expected[1]) < 1e-6, \ + f"{line!r} → {got}, expected {expected}" +ok(f"parsed {len(samples)} time lines with cue settings / odd spacing") + +# Full SRT with cue settings inline +weird_srt = ( + "1\n00:00:00,000 --> 00:00:03,000 position:90% align:start\nhello\n\n" + "2\n00:00:03,000 --> 00:00:07,000 line:80%\nworld\n" +) +weird = tmp / "weird.srt" +weird.write_text(weird_srt, encoding="utf-8") +parsed = parse_srt(weird) +assert len(parsed) == 2 +assert parsed[0].final_start == 0.0 and parsed[0].final_end == 3.0 +assert parsed[0].text == "hello" and parsed[1].text == "world" +ok("parse_srt tolerates cue settings end-to-end") + + +# -- 5. 
concat_quote_path edge cases ------------------------------------- + +section("concat_quote_path edge cases") +cases = [ + (Path("/tmp/foo.mp4"), "'/tmp/foo.mp4'"), + (Path("/tmp/foo bar.mp4"), "'/tmp/foo bar.mp4'"), + (Path("/tmp/it's.mp4"), "'/tmp/it'\\''s.mp4'"), + (Path("/tmp/he said 'hi'.mp4"), "'/tmp/he said '\\''hi'\\''.mp4'"), +] +for p, _expected in cases: + got = concat_quote_path(p) + # We only check the structural pattern: start/end with single quote, + # any embedded single-quotes are properly close-escape-reopened. + assert got.startswith("'") and got.endswith("'"), f"{p}: {got}" + # Verify reverse — closing+escape+reopen idiom for any input apostrophe + if "'" in p.as_posix(): + assert "'\\''" in got, f"{p}: {got}" + ok(f"{p.as_posix()!r:<35} → {got}") + +# CJK paths — verify it doesn't barf and produces a quoted UTF-8 string +# Note: concat_quote_path calls .resolve() which prepends a drive letter on +# Windows, so compare against the resolved posix form, not the literal input. +cjk_p = Path("/tmp/视频 v2/片段.mp4") +got = concat_quote_path(cjk_p) +assert got == f"'{cjk_p.resolve().as_posix()}'" +assert "视频" in got and "片段" in got +ok(f"CJK + space preserved: {got}") + + +# -- 6. make_safe_work_dir produces ASCII path --------------------------- + +section("make_safe_work_dir") +plan_with_cjk_path = tmp / "中文 plan.json" +plan_with_cjk_path.write_text("[]", encoding="utf-8") +wd = make_safe_work_dir("我的剪辑 v2!", plan_with_cjk_path) +assert wd.exists() and wd.is_dir() +# Path must be ASCII-only (no CJK leaks) +assert all(ord(c) < 128 for c in str(wd)), f"work dir not ASCII: {wd}" +assert "srt_edit_" in wd.name +ok(f"work dir is ASCII: {wd}") + +# Re-creating wipes previous contents (deterministic) +sentinel = wd / "_stale.txt" +sentinel.write_text("old") +wd2 = make_safe_work_dir("我的剪辑 v2!", plan_with_cjk_path) +assert wd2 == wd +assert not sentinel.exists() +ok("rerun wipes stale contents") + + +# -- 7. 
Sync tails defined and reasonable -------------------------------- + +section("Sync tail constants") +assert "fps=24" in V_SYNC_TAIL and "setpts=PTS-STARTPTS" in V_SYNC_TAIL +assert "aresample=async=1" in A_SYNC_TAIL and "asetpts=PTS-STARTPTS" in A_SYNC_TAIL +ok(f"V_SYNC_TAIL = {V_SYNC_TAIL}") +ok(f"A_SYNC_TAIL = {A_SYNC_TAIL}") + + +# -- 8. Strict validation ------------------------------------------------- + +section("Validation errors hard-fail") +import json as _j + +# duplicate id in SRT +bad = tmp / "dup.srt" +bad.write_text("1\n00:00:00,000 --> 00:00:01,000\na\n\n1\n00:00:01,000 --> 00:00:02,000\nb\n", encoding="utf-8") +try: + validate_srt(parse_srt(bad)) + fail("dup id should have errored") +except SystemExit as e: + ok(f"dup id: {e}") + +# overlap +bad.write_text("1\n00:00:00,000 --> 00:00:03,000\na\n\n2\n00:00:02,000 --> 00:00:04,000\nb\n", encoding="utf-8") +try: + validate_srt(parse_srt(bad)) + fail("overlap should have errored") +except SystemExit as e: + ok(f"overlap: {e}") + +# non-monotonic +bad.write_text("1\n00:00:05,000 --> 00:00:07,000\na\n\n2\n00:00:00,000 --> 00:00:02,000\nb\n", encoding="utf-8") +try: + validate_srt(parse_srt(bad)) + fail("non-monotonic should have errored") +except SystemExit as e: + ok(f"non-monotonic: {e}") + +# end <= start in plan +bad_plan = tmp / "bad_plan.json" +bad_plan.write_text(_j.dumps([{"id": 1, "source_start": "00:00:05,000", "source_end": "00:00:03,000"}]), encoding="utf-8") +try: + s, v, ents = parse_plan(bad_plan) + validate_plan(ents, s, v, Path("/fake/source.mp4")) + fail("end<=start should have errored") +except SystemExit as e: + ok(f"end<=start: {e}") + +# negative source_start +bad_plan.write_text(_j.dumps([{"id": 1, "source_start": "00:00:00,000", "source_end": "00:00:03,000"}]), encoding="utf-8") +s, v, ents = parse_plan(bad_plan) +ents[0].source_start = -1.0 +try: + validate_plan(ents, s, v, Path("/fake/source.mp4")) + fail("negative start should have errored") +except SystemExit as e: + 
ok(f"negative start: {e}") + +# id mismatch +ok_srt = parse_srt(base / "script.srt") +s, v, ents = parse_plan(base / "edit_plan.json") +from srt_driven_edit import PlanEntry +ents.append(PlanEntry(id=99, source_name="_default", source_start=0.0, source_end=1.0, voice_name=None)) +try: + validate_alignment(ok_srt, ents) + fail("id mismatch should have errored") +except SystemExit as e: + ok(f"id mismatch: {e}") + + +# -- 9. Alignment + gap handling on real example ------------------------- + +section("alignment on script.srt + edit_plan.json") +s, v, ents = parse_plan(base / "edit_plan.json") +segs = align(parse_srt(base / "script.srt"), ents, s, v, + legacy_default_source=Path("/fake/source.mp4"), + tolerance=0.5, trim_direction="tail", on_short="error") +for sg in segs: + print(f" id={sg.id} src[{sg.source_start:.3f}-{sg.source_end:.3f}] " + f"out[{sg.out_start:.3f}-{sg.out_end:.3f}] gap={sg.leading_gap:.3f}") +assert abs(segs[-1].out_end - 12.0) < 1e-6 +assert abs(segs[2].leading_gap - 1.5) < 1e-6 +ok("12.0s total, 1.5s gap before id=3") + + + +# -- 10. 
ensure_safe_subs_path self-defense ------------------------------ + +section("ensure_safe_subs_path") +# safe path (already ASCII, no single quote): returned as-is +safe_in = tmp / "plain.srt" +safe_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8") +out, cleanup = ensure_safe_subs_path(safe_in) +assert out == safe_in and cleanup is None +ok(f"ascii input returned as-is: {out.name}") + +# unsafe path: CJK in name → copied to safe location +cjk_in = tmp / "中文 字幕.srt" +cjk_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8") +out, cleanup = ensure_safe_subs_path(cjk_in) +assert out != cjk_in and cleanup == out +assert str(out).isascii(), f"safe copy still has non-ASCII chars: {out}" +assert out.read_text(encoding="utf-8").startswith("1") +ok(f"CJK input copied to safe path: {out}") +cleanup.unlink() + +# unsafe path: single quote in name → also copied +quote_in = tmp / "it's mine.srt" +quote_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8") +out, cleanup = ensure_safe_subs_path(quote_in) +assert out != quote_in and "'" not in str(out) +ok(f"single-quote input copied to safe path: {out}") +cleanup.unlink() + +# unsafe + non-UTF-8 input gets normalized through read_srt_text +gbk_in = tmp / "gbk 字幕.srt" +gbk_in.write_bytes("1\n00:00:00,000 --> 00:00:01,000\n中文\n".encode("gb18030")) +out, cleanup = ensure_safe_subs_path(gbk_in) +assert "中文" in out.read_text(encoding="utf-8") +ok(f"GB18030 + CJK path → normalized utf-8 safe copy") +cleanup.unlink() + +# _path_is_filter_safe sanity +assert _path_is_filter_safe(Path("/tmp/foo.srt")) is True +assert _path_is_filter_safe(Path("/tmp/视频.srt")) is False +assert _path_is_filter_safe(Path("/tmp/it's.srt")) is False +ok("_path_is_filter_safe correctly flags non-ASCII and single quote") + + +# -- 11. 
Cache key fingerprinting --------------------------------------- + +section("cache_key includes params fingerprint + ffmpeg version") +assert isinstance(PARAMS_FINGERPRINT, str) and len(PARAMS_FINGERPRINT) == 10 +ok(f"PARAMS_FINGERPRINT = {PARAMS_FINGERPRINT}") +assert CACHE_VERSION == 2 +ok(f"CACHE_VERSION bumped to {CACHE_VERSION}") + +# Build a fake segment pointed at a real file (this script) so _file_fingerprint works +fake_seg = Segment( + id=1, + source_path=Path(__file__).resolve(), + source_start=0.0, + source_end=1.0, + out_start=0.0, + out_end=1.0, + leading_gap=0.0, + text="x", + voice_path=None, + pad_short=False, + plan_src_dur=1.0, +) +k_v60 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False, + voice_signature=None, ffmpeg_version="6.0") +k_v71 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False, + voice_signature=None, ffmpeg_version="7.1") +assert k_v60 != k_v71, "different ffmpeg versions should produce different cache keys" +ok(f"ffmpeg 6.0 → {k_v60[:16]}…, 7.1 → {k_v71[:16]}… (differ)") + +k_bg0 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False, + voice_signature=None, ffmpeg_version="6.0") +k_bg1 = cache_key(fake_seg, effective_bg_volume=0.1, hdr=False, portrait=False, + voice_signature=None, ffmpeg_version="6.0") +assert k_bg0 != k_bg1, "different effective bg_volume must invalidate cache" +ok("effective bg_volume differs → cache key differs") + + +# -- 12. 
preflight + probe_streams (best-effort, ffmpeg may be absent) --- + +section("preflight + probe_streams (only if ffmpeg installed)") +import shutil as _sh +import subprocess as _sp +if _sh.which("ffmpeg") and _sh.which("ffprobe"): + from srt_driven_edit import preflight, probe_streams + versions = preflight() + assert "ffmpeg" in versions and "ffprobe" in versions + ok(f"preflight ok: {versions}") + + # Build a 0.5s test mp4 with video + audio via lavfi + av_mp4 = tmp / "probe_av.mp4" + _sp.run([ + "ffmpeg", "-y", "-hide_banner", "-loglevel", "error", + "-f", "lavfi", "-i", "color=c=red:s=320x240:r=24:d=0.5", + "-f", "lavfi", "-i", "anullsrc=channel_layout=stereo:sample_rate=48000", + "-t", "0.5", + "-c:v", "libx264", "-pix_fmt", "yuv420p", + "-c:a", "aac", + str(av_mp4), + ], check=True) + info = probe_streams(av_mp4) + assert info["has_video"] is True and info["has_audio"] is True + assert abs(info["duration"] - 0.5) < 0.1 + ok(f"probe video+audio mp4: {info}") + + # Video-only mp4 → has_audio False, exercises the auto-degrade path + v_only = tmp / "probe_vonly.mp4" + _sp.run([ + "ffmpeg", "-y", "-hide_banner", "-loglevel", "error", + "-f", "lavfi", "-i", "color=c=red:s=320x240:r=24:d=0.5", + "-an", "-t", "0.5", + "-c:v", "libx264", "-pix_fmt", "yuv420p", + str(v_only), + ], check=True) + info = probe_streams(v_only) + assert info["has_video"] is True and info["has_audio"] is False + ok(f"probe video-only mp4: {info}") + + # Garbage input → SystemExit, not a silent pass + junk = tmp / "junk.mp4" + junk.write_bytes(b"not a media file") + try: + probe_streams(junk) + fail("probe_streams on garbage should have raised") + except SystemExit as e: + ok(f"probe_streams hard-fails on junk: {str(e)[:80]}") +else: + ok("ffmpeg not on PATH — preflight/probe_streams tests skipped") + + +print("\n=== ALL TESTS PASSED ===") diff --git a/examples/srt_driven/edit_plan.json b/examples/srt_driven/edit_plan.json new file mode 100644 index 0000000..0e15d43 --- /dev/null +++ 
b/examples/srt_driven/edit_plan.json @@ -0,0 +1,17 @@ +[ + { + "id": 1, + "source_start": "00:12:30,000", + "source_end": "00:12:33,000" + }, + { + "id": 2, + "source_start": "00:18:05,000", + "source_end": "00:18:09,000" + }, + { + "id": 3, + "source_start": "00:22:14,500", + "source_end": "00:22:18,000" + } +] diff --git a/examples/srt_driven/edit_plan_v2.json b/examples/srt_driven/edit_plan_v2.json new file mode 100644 index 0000000..fd7530b --- /dev/null +++ b/examples/srt_driven/edit_plan_v2.json @@ -0,0 +1,32 @@ +{ + "sources": { + "A": "raw/take_a.mp4", + "B": "raw/take_b.mp4" + }, + "voices": { + "host": "voice/host.wav", + "guest": "voice/guest.wav" + }, + "segments": [ + { + "id": 1, + "source": "A", + "source_start": "00:12:30,000", + "source_end": "00:12:33,000", + "voice": "host" + }, + { + "id": 2, + "source": "B", + "source_start": "00:18:05,000", + "source_end": "00:18:09,000", + "voice": "guest" + }, + { + "id": 3, + "source": "A", + "source_start": "00:22:14,500", + "source_end": "00:22:18,000" + } + ] +} diff --git a/examples/srt_driven/jobs.json b/examples/srt_driven/jobs.json new file mode 100644 index 0000000..88f1aef --- /dev/null +++ b/examples/srt_driven/jobs.json @@ -0,0 +1,25 @@ +[ + { + "name": "promo_en", + "source": "raw/take_a.mp4", + "srt": "script.srt", + "plan": "edit_plan.json", + "bg_volume": 0.0, + "style": "bold-uppercase" + }, + { + "name": "promo_cn", + "source": "raw/take_a.mp4", + "srt": "script_cjk.srt", + "plan": "edit_plan.json", + "bg_volume": 0.1, + "style": "cjk-natural" + }, + { + "name": "promo_multi", + "srt": "script.srt", + "plan": "edit_plan_v2.json", + "bg_volume": 0.0, + "style": "auto" + } +] diff --git a/examples/srt_driven/script.srt b/examples/srt_driven/script.srt new file mode 100644 index 0000000..dde4617 --- /dev/null +++ b/examples/srt_driven/script.srt @@ -0,0 +1,11 @@ +1 +00:00:00,000 --> 00:00:03,000 +Ninety percent of what an agent does is wasted. 
+ +2 +00:00:03,000 --> 00:00:07,000 +We rewrote the planner from scratch this quarter. + +3 +00:00:08,500 --> 00:00:12,000 +Here is what changed and what it cost us. diff --git a/examples/srt_driven/script_cjk.srt b/examples/srt_driven/script_cjk.srt new file mode 100644 index 0000000..04dc35f --- /dev/null +++ b/examples/srt_driven/script_cjk.srt @@ -0,0 +1,11 @@ +1 +00:00:00,000 --> 00:00:03,000 +百分之九十的 agent 工作都被浪费了。 + +2 +00:00:03,000 --> 00:00:07,000 +我们这季度把 planner 重写了一遍。 + +3 +00:00:08,500 --> 00:00:12,000 +这里讲一下改了什么、代价是什么。 diff --git a/helpers/recommend_edit_plan.py b/helpers/recommend_edit_plan.py new file mode 100644 index 0000000..7db1c21 --- /dev/null +++ b/helpers/recommend_edit_plan.py @@ -0,0 +1,630 @@ +"""Recommend an edit_plan.json from script.srt + source transcript. + +Pipeline position: + script.srt + transcript.json + --(this script)--> + edit_plan.json + edit_plan_review.md + --(srt_driven_edit.py)--> + final.mp4 + +Matching is best-effort LEXICAL (no LLM, no semantic understanding): + 1. Parse Scribe JSON → keep only timestamped 'word' tokens. Without + word-level start/end timestamps we cannot produce reliable + source_start / source_end, so plain-text transcripts are not usable. + 2. Build candidate ranges by breaking on sentence-end punctuation, + silences ≥ gap_threshold, or speaker change; split long candidates + at phrase punctuation then by hard word-level windows. + 3. For each SRT cue, score every candidate by: + 0.6 * SequenceMatcher(normalized chars) + + 0.4 * Jaccard (token-level for Latin / 2-gram for CJK) + blended with duration similarity at 0.7 / 0.3. + The matcher cannot understand storyline — if the SRT narration uses + words not present in the source transcript, scores will be low and + matches will need manual review. + 4. Greedy assignment, no reuse unless --allow-reuse. + 5. Emit Form-A or Form-B plan + a sidecar review markdown. 
def load_transcript_words(path: Path, keep_audio_events: bool = False) -> list[dict]:
    """Return Scribe word tokens with valid timestamps.

    Reads the transcript JSON at ``path`` and keeps, in file order:
      * every token whose ``type`` is ``"word"`` and whose ``start`` and
        ``end`` are both present (not ``None``);
      * every token whose ``type`` is ``"audio_event"``, but only when
        ``keep_audio_events`` is true (these are kept without a timestamp
        check).
    All other token types, and entries that are not JSON objects, are skipped.

    Args:
        path: Transcript JSON file. Must be an object with a ``"words"`` list.
        keep_audio_events: Also keep ``audio_event`` tokens.

    Returns:
        The retained token dicts, unmodified.

    Raises:
        SystemExit: if the file is not UTF-8 text, is not valid JSON, has no
            ``"words"`` list, or yields no usable tokens.
    """
    try:
        # utf-8-sig accepts plain UTF-8 and also strips a leading BOM, which
        # json.loads would otherwise reject ("Unexpected UTF-8 BOM").
        raw = path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError as e:
        raise SystemExit(f"transcript not valid UTF-8: {path}: {e}")
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        raise SystemExit(f"transcript not valid JSON: {path}: {e}")
    # A top-level array/string/number has no .get(); treat it like a missing list.
    words = data.get("words") if isinstance(data, dict) else None
    if not isinstance(words, list):
        raise SystemExit(f"transcript missing 'words' list: {path}")
    out: list[dict] = []
    for w in words:
        if not isinstance(w, dict):
            # Malformed entry (e.g. a bare string) — cannot carry timestamps.
            continue
        wt = w.get("type")
        if wt == "word":
            if w.get("start") is None or w.get("end") is None:
                continue
            out.append(w)
        elif wt == "audio_event" and keep_audio_events:
            out.append(w)
    if not out:
        raise SystemExit(f"transcript has no usable word tokens: {path}")
    return out
Non-overlapping by construction.""" + # Step 1: raw groups by sentence-end punct / silence / speaker change + raw_groups: list[list[dict]] = [] + current: list[dict] = [] + prev_end: float | None = None + prev_speaker: str | None = None + for w in words: + if w.get("type") != "word": + continue + text = (w.get("text") or "").strip() + if not text: + continue + ws = float(w["start"]) + we = float(w["end"]) + speaker = w.get("speaker_id") + if prev_speaker is not None and speaker is not None and speaker != prev_speaker: + if current: + raw_groups.append(current); current = [] + if prev_end is not None and (ws - prev_end) >= gap_threshold: + if current: + raw_groups.append(current); current = [] + current.append(w) + prev_end = we + prev_speaker = speaker + if text[-1] in SENT_END_PUNCT: + raw_groups.append(current); current = [] + if current: + raw_groups.append(current) + + # Step 2: split groups that exceed max_dur — phrase punct first, then hard + out: list[Candidate] = [] + for group in raw_groups: + if not group: + continue + start = float(group[0]["start"]) + end = float(group[-1]["end"]) + if end - start <= max_dur: + out.append(Candidate(start, end, _join_words(group))) + continue + parts: list[list[dict]] = [] + buf: list[dict] = [] + for w in group: + buf.append(w) + text = (w.get("text") or "").strip() + if text and text[-1] in PHRASE_PUNCT: + parts.append(buf); buf = [] + if buf: + parts.append(buf) + for part in parts: + ps = float(part[0]["start"]); pe = float(part[-1]["end"]) + if pe - ps <= max_dur: + out.append(Candidate(ps, pe, _join_words(part))) + else: + out.extend(_hard_split(part, max_dur)) + + return [c for c in out if c.duration >= min_dur] + + +# ============================================================================ +# Scoring +# ============================================================================ + + +# Keep word characters, whitespace, and CJK ranges; replace everything else +# (punctuation, brackets, audio-event markers) 
with a space. +_NORMALIZE_RE = re.compile( + r"[^\w\s一-鿿㐀-䶿぀-ゟ゠-ヿ가-힯]+", + flags=re.UNICODE, +) +_WS_RE = re.compile(r"\s+") + + +def normalize_text(text: str) -> str: + s = text.casefold() + s = _NORMALIZE_RE.sub(" ", s) + s = _WS_RE.sub(" ", s).strip() + return s + + +def is_cjk_heavy(text: str) -> bool: + """True if at least half of the non-whitespace characters are CJK.""" + chars = [c for c in text if not c.isspace()] + if not chars: + return False + cjk = sum(1 for c in chars if CJK_RE.match(c)) + return cjk * 2 >= len(chars) + + +def _tokens(text: str) -> list[str]: + return text.split() + + +def _char_bigrams(text: str) -> set[str]: + chars = [c for c in text if not c.isspace()] + return {"".join(chars[i:i + 2]) for i in range(len(chars) - 1)} + + +def _jaccard(a: set | list, b: set | list) -> float: + sa, sb = set(a), set(b) + if not sa and not sb: + return 1.0 + if not sa or not sb: + return 0.0 + return len(sa & sb) / len(sa | sb) + + +def text_similarity(cue_text: str, cand_text: str) -> float: + """Blend of SequenceMatcher (local structure) and Jaccard (bag of units).""" + a = normalize_text(cue_text) + b = normalize_text(cand_text) + if not a or not b: + return 0.0 + seq = SequenceMatcher(None, a, b, autojunk=False).ratio() + if is_cjk_heavy(a) or is_cjk_heavy(b): + jc = _jaccard(_char_bigrams(a), _char_bigrams(b)) + else: + jc = _jaccard(_tokens(a), _tokens(b)) + return 0.6 * seq + 0.4 * jc + + +def duration_similarity(cand_dur: float, cue_dur: float) -> float: + if cue_dur <= 0: + return 0.0 + delta = abs(cand_dur - cue_dur) + return 1.0 / (1.0 + delta / cue_dur) + + +def combined_score(cue: SrtCue, cand: Candidate, + w_text: float = 0.7, w_dur: float = 0.3) -> float: + return ( + w_text * text_similarity(cue.text, cand.text) + + w_dur * duration_similarity(cand.duration, cue.duration) + ) + + +# ============================================================================ +# Assignment +# 
def assign(
    cues: list[SrtCue],
    candidates: list[Candidate],
    *,
    allow_reuse: bool = False,
    min_score: float = 0.35,
    duration_warn_ratio: float = 0.5,
    monotonic_source: bool = False,
    max_source_gap_warn: float | None = None,
) -> list[Assignment]:
    """Greedily match each cue, in the order given, to its best-scoring candidate.

    For every cue the candidate with the highest ``combined_score`` wins
    (first one wins on ties). A candidate is skipped when it was already
    used and ``allow_reuse`` is false, or when ``monotonic_source`` is true
    and it starts before the end of the previously chosen candidate.

    The chosen candidate is always recorded, even with a poor score; problems
    are reported as warning strings on the returned ``Assignment``:
      * no eligible candidate at all (``cand`` is ``None``, score 0.0);
      * score below ``min_score``;
      * relative duration difference above ``duration_warn_ratio``;
      * candidate shorter than the cue.

    A second pass over adjacent assignments adds, on the later one, a warning
    for a backward source-time jump (non-monotonic mode only) and, when
    ``max_source_gap_warn`` is set, for any jump whose absolute size exceeds
    it. These are informational and never change the selection.

    Returns:
        One ``Assignment`` per cue, in cue order.
    """
    taken: set[int] = set()
    results: list[Assignment] = []
    # Earliest source time the next pick may start at (monotonic mode only).
    floor = 0.0

    for cue in cues:
        pick = -1
        pick_score = -1.0
        for idx, option in enumerate(candidates):
            if idx in taken and not allow_reuse:
                continue
            if monotonic_source and option.start < floor - 1e-6:
                continue
            value = combined_score(cue, option)
            if value > pick_score:
                pick, pick_score = idx, value

        notes: list[str] = []

        if pick < 0:
            # Nothing eligible: record an unmatched cue and move on.
            if monotonic_source:
                notes.append(
                    f"no candidate available at or after source time "
                    f"{format_srt_ts(floor)} (monotonic constraint)"
                )
            else:
                notes.append("no candidate available")
            results.append(Assignment(
                cue_id=cue.id, cue_text=cue.text, cue_duration=cue.duration,
                cand=None, score=0.0, warnings=notes,
            ))
            continue

        chosen = candidates[pick]
        if not allow_reuse:
            taken.add(pick)
        if monotonic_source:
            # The following cue must not start before this match ends.
            floor = chosen.end

        if pick_score < min_score:
            notes.append(f"low score {pick_score:.3f} < {min_score}")
        if cue.duration > 0:
            off_by = abs(chosen.duration - cue.duration) / cue.duration
            if off_by > duration_warn_ratio:
                notes.append(
                    f"duration mismatch: cand {chosen.duration:.2f}s vs "
                    f"cue {cue.duration:.2f}s ({off_by:.0%} off)"
                )
        if chosen.duration + 1e-6 < cue.duration:
            notes.append(
                "candidate shorter than cue — will need `--on-short pad` "
                "in srt_driven_edit"
            )
        results.append(Assignment(
            cue_id=cue.id, cue_text=cue.text, cue_duration=cue.duration,
            cand=chosen, score=pick_score, warnings=notes,
        ))

    # Second pass: flag source-time discontinuities on the later cue of each
    # adjacent pair. Pairs with an unmatched member are skipped.
    for earlier, later in zip(results, results[1:]):
        before = earlier.cand
        after = later.cand
        if before is None or after is None:
            continue
        gap = after.start - before.end
        if gap < -1e-3 and not monotonic_source:
            later.warnings.append(
                f"source time goes backward {gap:+.2f}s: prev cue ends at "
                f"{format_srt_ts(before.end)}, this cue starts at "
                f"{format_srt_ts(after.start)}"
            )
        if max_source_gap_warn is not None and abs(gap) > max_source_gap_warn:
            later.warnings.append(
                f"source-time jump {gap:+.2f}s exceeds "
                f"--max-source-gap {max_source_gap_warn:.2f}s"
            )

    return results
+ ) + + +def write_plan_form_a(assignments: list[Assignment], out_path: Path) -> None: + _require_all_assigned(assignments) + rows = [ + { + "id": a.cue_id, + "source_start": format_srt_ts(a.cand.start), + "source_end": format_srt_ts(a.cand.end), + } + for a in assignments + ] + out_path.write_text(json.dumps(rows, indent=2, ensure_ascii=False), encoding="utf-8") + + +def write_plan_form_b( + assignments: list[Assignment], + source_path: Path, + source_name: str, + out_path: Path, +) -> None: + _require_all_assigned(assignments) + data = { + "sources": {source_name: str(source_path)}, + "segments": [ + { + "id": a.cue_id, + "source": source_name, + "source_start": format_srt_ts(a.cand.start), + "source_end": format_srt_ts(a.cand.end), + } + for a in assignments + ], + } + out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") + + +def write_review(assignments: list[Assignment], out_path: Path) -> None: + lines: list[str] = ["# Edit plan review", ""] + total = len(assignments) + matched = sum(1 for a in assignments if a.cand is not None) + warned = sum(1 for a in assignments if a.warnings) + avg = (sum(a.score for a in assignments if a.cand) / max(matched, 1)) + lines.append(f"- total cues: {total}") + lines.append(f"- matched: {matched}/{total}") + lines.append(f"- with warnings: {warned}") + lines.append(f"- average score: {avg:.3f}") + lines.append("") + for a in assignments: + lines.append(f"## cue id={a.cue_id}") + lines.append(f"- **cue text**: {a.cue_text!r}") + lines.append(f"- **cue duration**: {a.cue_duration:.3f}s") + if a.cand is None: + lines.append("- **match**: NONE") + else: + lines.append(f"- **matched text**: {a.cand.text!r}") + lines.append( + f"- **source range**: {format_srt_ts(a.cand.start)} → " + f"{format_srt_ts(a.cand.end)} ({a.cand.duration:.3f}s)" + ) + lines.append(f"- **score**: {a.score:.3f}") + dd = a.cand.duration - a.cue_duration + lines.append(f"- **duration delta**: {dd:+.3f}s") + for w in a.warnings: 
def recommend(
    *,
    script_srt: Path,
    transcript: Path,
    source: Path,
    output: Path,
    review: Path | None = None,
    source_name: str = "A",
    output_format: str = "form-a",
    gap_threshold: float = 0.5,
    max_cand_dur: float = 12.0,
    min_cand_dur: float = 0.4,
    min_score: float = 0.35,
    allow_reuse: bool = False,
    keep_audio_events: bool = False,
    monotonic_source: bool = False,
    max_source_gap_warn: float | None = None,
) -> list[Assignment]:
    """Build an edit plan from a script SRT and a transcript, then write it.

    Steps: parse the script cues, load transcript words, build candidates,
    assign one candidate per cue, write the plan in the requested form, and
    write a Markdown review next to it.

    Args:
        script_srt: script SRT path, handed to ``_parse_srt``.
        transcript: transcript path, handed to ``load_transcript_words``.
        source: source video path; only recorded in Form-B plans.
        output: destination of the plan JSON.
        review: destination of the review; defaults to
            ``<output stem>_review.md`` beside ``output``.
        source_name: key used for the source in Form-B plans.
        output_format: ``"form-a"`` or ``"form-b"``.
        gap_threshold, max_cand_dur, min_cand_dur: forwarded to
            ``build_candidates``.
        min_score, allow_reuse, monotonic_source, max_source_gap_warn:
            forwarded to ``assign``.
        keep_audio_events: forwarded to ``load_transcript_words``.

    Returns:
        The list of assignments, one per cue.

    Raises:
        SystemExit: unknown ``output_format``, no cues in the script, no
            candidates built from the transcript, or (from the writers) a
            cue without a candidate.
    """
    # Validate the format first: it is a cheap check, and failing only after
    # parsing, candidate building and assignment would waste the whole run.
    if output_format not in ("form-a", "form-b"):
        raise SystemExit(f"unknown --format: {output_format}")

    cues = _parse_srt(script_srt)
    if not cues:
        raise SystemExit(f"script.srt has no cues: {script_srt}")

    words = load_transcript_words(transcript, keep_audio_events=keep_audio_events)
    candidates = build_candidates(
        words,
        gap_threshold=gap_threshold,
        max_dur=max_cand_dur,
        min_dur=min_cand_dur,
    )
    if not candidates:
        raise SystemExit(
            f"no candidates built from transcript {transcript}. "
            "Try lowering --min-cand-dur or check transcript quality."
        )

    assignments = assign(
        cues, candidates,
        allow_reuse=allow_reuse, min_score=min_score,
        monotonic_source=monotonic_source,
        max_source_gap_warn=max_source_gap_warn,
    )

    if output_format == "form-a":
        write_plan_form_a(assignments, output)
    else:
        write_plan_form_b(assignments, source, source_name, output)

    if review is None:
        review = output.with_name(output.stem + "_review.md")
    write_review(assignments, review)
    return assignments
default 0.5") + ap.add_argument("--max-cand-dur", type=float, default=12.0, + help="max candidate duration before forced split. default 12.0") + ap.add_argument("--min-cand-dur", type=float, default=0.4, + help="drop candidates shorter than this. default 0.4") + ap.add_argument("--min-score", type=float, default=0.35, + help="score below this triggers a warning. default 0.35") + ap.add_argument("--allow-reuse", action="store_true", + help="allow one candidate to be assigned to multiple cues") + ap.add_argument("--keep-audio-events", action="store_true", + help="keep (laughter) (applause) tokens as candidate context") + ap.add_argument("--monotonic-source", action="store_true", + help="require each cue's source range to start at or after " + "the previous cue's match. Prevents narrative time " + "reversal when the same line appears multiple times " + "in the source.") + ap.add_argument("--max-source-gap", type=float, default=None, + help="seconds. When set, any adjacent assignment whose " + "|source-time gap| exceeds this earns a warning.") + ap.add_argument("--format", choices=["form-a", "form-b"], default="form-a", + dest="output_format") + ap.add_argument("-o", "--output", type=Path, required=True, + help="edit_plan.json path") + ap.add_argument("--review", type=Path, default=None, + help="review .md path (default: _review.md)") + args = ap.parse_args() + + assignments = recommend( + script_srt=args.script.resolve(), + transcript=args.transcript.resolve(), + source=args.source.resolve(), + output=args.output.resolve(), + review=args.review.resolve() if args.review else None, + source_name=args.source_name, + output_format=args.output_format, + gap_threshold=args.gap_threshold, + max_cand_dur=args.max_cand_dur, + min_cand_dur=args.min_cand_dur, + min_score=args.min_score, + allow_reuse=args.allow_reuse, + keep_audio_events=args.keep_audio_events, + monotonic_source=args.monotonic_source, + max_source_gap_warn=args.max_source_gap, + ) + + matched = sum(1 for a in 
assignments if a.cand is not None) + warned = sum(1 for a in assignments if a.warnings) + avg = sum(a.score for a in assignments if a.cand is not None) / max(matched, 1) + review_path = ( + args.review.resolve() if args.review + else args.output.resolve().with_name(args.output.stem + "_review.md") + ) + print(f"wrote plan → {args.output}") + print(f"wrote review → {review_path}") + print(f" {matched}/{len(assignments)} cues matched, avg score {avg:.3f}, " + f"{warned} with warnings") + + +if __name__ == "__main__": + main() diff --git a/helpers/run_episodes.py b/helpers/run_episodes.py new file mode 100644 index 0000000..6047c85 --- /dev/null +++ b/helpers/run_episodes.py @@ -0,0 +1,248 @@ +"""Run srt_driven_edit across every episode subdirectory under a root. + +Discovery convention (flat per-episode layout): + //source.mp4 required + //script.srt required + //edit_plan.json required (Form A or B) + //voice.wav optional (global voice for this ep) + +Outputs: + //final.mp4 + //edit/... (EDL, QC report, cache — managed by srt_driven_edit) + /run_episodes_summary.json + +Usage: + python helpers/run_episodes.py batch/ + python helpers/run_episodes.py batch/ --bg-volume 0.1 --style cjk-natural + python helpers/run_episodes.py batch/ --continue-on-error +""" + +from __future__ import annotations + +import argparse +import json +import time +from dataclasses import dataclass +from pathlib import Path + +try: + from srt_driven_edit import ( + Job, run_job, preflight, safe_ascii_name, + make_failure_record, + ) +except Exception as e: + raise SystemExit( + "run_episodes: failed to import from srt_driven_edit.py. " + f"Both files must be importable from the same helpers/ dir. 
REQUIRED_FILES = ("source.mp4", "script.srt", "edit_plan.json")
OPTIONAL_VOICE = "voice.wav"


@dataclass
class EpisodeJob:
    # One discovered episode directory and the resolved paths of its inputs.
    name: str           # directory name of the episode
    root: Path          # resolved episode directory
    source: Path        # resolved source.mp4
    srt: Path           # resolved script.srt
    plan: Path          # resolved edit_plan.json
    voice: Path | None  # resolved voice.wav, or None when the file is absent


def discover_episodes(root: Path) -> list[EpisodeJob]:
    """Collect the episode directories directly under `root`.

    A subdirectory counts as an episode when it contains every name in
    REQUIRED_FILES. Incomplete subdirectories are reported on stdout and
    skipped, so a partially prepared batch can still run. Episodes are
    returned ordered by directory name.

    Raises SystemExit when `root` is not a directory or when no
    subdirectory qualifies.
    """
    if not root.is_dir():
        raise SystemExit(f"not a directory: {root}")

    subdirs = [entry for entry in root.iterdir() if entry.is_dir()]
    subdirs.sort(key=lambda entry: entry.name)

    found: list[EpisodeJob] = []
    incomplete: list[tuple[str, list[str]]] = []
    for ep_dir in subdirs:
        absent = []
        for required in REQUIRED_FILES:
            if not (ep_dir / required).is_file():
                absent.append(required)
        if absent:
            incomplete.append((ep_dir.name, absent))
            continue

        voice_file = ep_dir / OPTIONAL_VOICE
        resolved_voice = voice_file.resolve() if voice_file.is_file() else None
        found.append(
            EpisodeJob(
                name=ep_dir.name,
                root=ep_dir.resolve(),
                source=(ep_dir / "source.mp4").resolve(),
                srt=(ep_dir / "script.srt").resolve(),
                plan=(ep_dir / "edit_plan.json").resolve(),
                voice=resolved_voice,
            )
        )

    if incomplete:
        print(f"skipped {len(incomplete)} dir(s) missing required files:")
        for dir_name, absent in incomplete:
            print(f" {dir_name}: missing {', '.join(absent)}")
    if found:
        return found
    raise SystemExit(
        f"no usable episode dirs under {root}. Each ep dir needs: "
        f"{list(REQUIRED_FILES)}"
    )
Returns a summary dict and + also writes it to `/run_episodes_summary.json`.""" + root = root.resolve() + eps = discover_episodes(root) + print(f"\ndiscovered {len(eps)} episode(s) under {root}:") + for ep in eps: + print(f" {ep.name} voice={'yes' if ep.voice else 'no'}") + + opts = { + "bg_volume": bg_volume, + "tolerance": tolerance, + "trim_direction": trim_direction, + "on_short": on_short, + "style": style, + "fontsdir": fontsdir, + "no_cache": no_cache, + "no_overwrite": no_overwrite, + "keep_intermediates": keep_intermediates, + "mode": mode, + } + + results: list[dict] = [] + t0 = time.time() + for i, ep in enumerate(eps): + print(f"\n[{i + 1}/{len(eps)}] === {ep.name} ===") + job = _make_job(ep, opts) + try: + qc = run_job(job, ffmpeg_version) + results.append(qc) + except (SystemExit, Exception) as e: + if continue_on_error: + print(f"[{i + 1}/{len(eps)}] FAILED: " + f"{type(e).__name__}: {e}") + results.append(make_failure_record( + index=i, name=ep.name, error=e, job=job, + )) + continue + raise + + ok = sum(1 for r in results if r.get("ok")) + summary = { + "root": str(root), + "episodes_total": len(eps), + "ok": ok, + "elapsed_s": round(time.time() - t0, 2), + "results": results, + } + summary_path = root / "run_episodes_summary.json" + summary_path.write_text( + json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8" + ) + print(f"\n{ok}/{len(results)} episodes ok ({summary['elapsed_s']}s)") + print(f"summary → {summary_path}") + return summary + + +def main() -> None: + ap = argparse.ArgumentParser( + description="Run srt_driven_edit across every ep*/ subdirectory.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=( + "Per-episode layout:\n" + " //source.mp4 required\n" + " //script.srt required\n" + " //edit_plan.json required (Form A or B)\n" + " //voice.wav optional\n\n" + "Outputs land at //final.mp4 with edit/ artifacts." 
+ ), + ) + ap.add_argument("root", type=Path, + help="directory whose immediate subdirs are episodes") + ap.add_argument("--bg-volume", type=float, default=0.0) + ap.add_argument("--tolerance", type=float, default=0.5) + ap.add_argument("--trim-direction", choices=["tail", "head", "center"], default="tail") + ap.add_argument("--on-short", choices=["error", "pad"], default="error") + ap.add_argument("--style", default="auto") + ap.add_argument("--fontsdir", type=Path, default=None) + ap.add_argument("--no-cache", action="store_true") + ap.add_argument("--no-overwrite", action="store_true") + ap.add_argument("--keep-intermediates", action="store_true") + ap.add_argument("--continue-on-error", action="store_true", + help="skip episodes that fail instead of aborting") + ap.add_argument( + "--mode", choices=["full", "extract"], default="full", + help="'full' (default) runs the complete pipeline per episode. " + "'extract' stops after segment extraction and saves clips " + "under each ep's edit/ dir; gap clips, voice mixing, " + "subtitle burn, and QC report are skipped.", + ) + args = ap.parse_args() + + versions = preflight() + print(f"== ffmpeg {versions['ffmpeg']} / ffprobe {versions['ffprobe']} ==") + + summary = run_episodes( + args.root, + ffmpeg_version=versions["ffmpeg"], + bg_volume=args.bg_volume, + tolerance=args.tolerance, + trim_direction=args.trim_direction, + on_short=args.on_short, + style=args.style, + fontsdir=args.fontsdir.resolve() if args.fontsdir else None, + no_cache=args.no_cache, + no_overwrite=args.no_overwrite, + keep_intermediates=args.keep_intermediates, + continue_on_error=args.continue_on_error, + mode=args.mode, + ) + # Exit nonzero if any episode failed (even with --continue-on-error, + # the caller probably wants to know). 
+ if summary["ok"] < summary["episodes_total"]: + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/helpers/srt_driven_edit.py b/helpers/srt_driven_edit.py new file mode 100644 index 0000000..7a65c86 --- /dev/null +++ b/helpers/srt_driven_edit.py @@ -0,0 +1,1704 @@ +"""SRT-driven edit: assemble a final cut by aligning source ranges to an SRT. + +Independent pipeline. Does NOT touch the main render.py flow. Use when you +have a finished script (script.srt = final captions timeline) and a list of +source ranges keyed by SRT id. + +Pipeline: + parse SRT + plan ─> strict validate ─> align ─> resolve style + ─> extract segments (with cache) ─> insert gap clips ─> concat + ─> audio replace/mix + subtitle burn LAST (Hard Rule 1) ─> QC report + +Schemas (both forms accepted): + + Form A — array, single source (legacy): + [{"id": 1, "source_start": "HH:MM:SS,ms", "source_end": "HH:MM:SS,ms"}, ...] + + CLI --source + + Form B — object, multi-source / multi-voice: + { + "sources": {"A": "path/a.mp4", "B": "path/b.mp4"}, + "voices": {"main": "path/v.wav"}, + "segments": [ + {"id": 1, "source": "A", "source_start": "...", "source_end": "...", + "voice": "main"}, + {"id": 2, "source": "B", "source_start": "...", "source_end": "..."} + ] + } + +Batch: + --batch jobs.json (array of per-job dicts, same fields as CLI flags) + --batch jobs.csv (header row of the same fields) +""" + +from __future__ import annotations + +import argparse +import csv +import hashlib +import json +import os +import re +import shutil +import subprocess +import sys +import tempfile +import time +from dataclasses import dataclass, field, asdict +from pathlib import Path +from typing import Any + +try: + from render import ( + SUB_FORCE_STYLE as _RENDER_SUB_STYLE, + TONEMAP_CHAIN, + is_hdr_source, + is_portrait_source, + ) +except Exception: + _RENDER_SUB_STYLE = ( + "FontName=Helvetica,FontSize=18,Bold=1," + "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000," + 
"BorderStyle=1,Outline=2,Shadow=0," + "Alignment=2,MarginV=90" + ) + TONEMAP_CHAIN = "" + + def is_hdr_source(video: Path) -> bool: # type: ignore + return False + + def is_portrait_source(video: Path) -> bool: # type: ignore + return False + + +# ============================================================================ +# Constants +# ============================================================================ + +FPS = 24 +SAMPLE_RATE = 48000 +AUDIO_BITRATE = "192k" +DURATION_DRIFT_TOLERANCE_S = 0.2 + +STYLE_TEMPLATES: dict[str, str] = { + "bold-uppercase": _RENDER_SUB_STYLE, + "cjk-natural": ( + "FontName=Microsoft YaHei UI,FontSize=20,Bold=0," + "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000," + "BorderStyle=1,Outline=2,Shadow=0," + "Alignment=2,MarginV=90" + ), + "narrative": ( + "FontName=Helvetica,FontSize=20,Bold=0," + "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000," + "BorderStyle=1,Outline=2,Shadow=0," + "Alignment=2,MarginV=80" + ), +} + +CJK_RE = re.compile( + r"[一-鿿㐀-䶿぀-ゟ゠-ヿ가-힯]" +) + +CACHE_VERSION = 2 # bumped: cache now keyed by ffmpeg version + encoding params + +# Encoding-affecting constants captured into a single fingerprint so that +# any later tweak to codec / preset / sync tails forces a cache miss. If you +# change PARAMS_FINGERPRINT's inputs, existing cached clips are auto-invalidated. +def _params_fingerprint() -> str: + payload = repr([ + "fps", 24, + "sr", 48000, + "ab", "192k", + "ac", 2, + "v_codec", "libx264", "preset", "fast", "crf", 20, "pix", "yuv420p", + "a_codec", "aac", + ]) + return hashlib.sha1(payload.encode("utf-8")).hexdigest()[:10] + + +# Encodings tried in order when reading user-supplied SRT files. Windows +# Chinese systems frequently save as GBK/GB18030; macOS / *nix typically +# UTF-8 (with or without BOM). cp1252 is the last-resort Western Latin1. 
def subs_filter_escape(path: Path) -> str:
    """Escape a path for the argument of ffmpeg's subtitles='...' filter.

    The path is resolved and rendered with forward slashes, then three
    substitutions are applied in a fixed order: backslash -> double
    backslash, colon -> backslash-colon, single quote -> backslash-quote.
    Backslashes must go first so the escapes added afterwards are not
    doubled.
    """
    escaped = path.resolve().as_posix()
    for needle, replacement in (("\\", "\\\\"), (":", "\\:"), ("'", "\\'")):
        escaped = escaped.replace(needle, replacement)
    return escaped


def safe_ascii_name(stem: str) -> str:
    """Turn a filename stem into an ASCII slug of at most 48 characters.

    Every run of characters outside ``A-Za-z0-9._-`` collapses to one
    underscore; leading/trailing underscores are dropped. If nothing is
    left, the slug is "job".
    """
    slug = re.sub(r"[^A-Za-z0-9._-]+", "_", stem).strip("_")
    if not slug:
        slug = "job"
    return slug[:48]


def concat_quote_path(p: Path) -> str:
    """Quote a path for a 'file' directive of ffmpeg's concat demuxer.

    The resolved path is rendered with forward slashes and wrapped in single
    quotes; an embedded single quote is written with the
    close-escape-reopen sequence (quote, backslash, quote, quote).
    """
    posix = p.resolve().as_posix()
    return "'" + posix.replace("'", "'\\''") + "'"
def make_safe_work_dir(job_name: str, plan_path: Path) -> Path:
    """Create (or reset) an ASCII-named temp dir for one job's intermediates.

    The directory lives under tempfile.gettempdir() and is named
    ``srt_edit_<12 hex chars>``, where the hex is a SHA-1 prefix of the
    resolved plan path plus the job name. Re-running the same job therefore
    lands in the same directory, and the name never inherits CJK / quote /
    space characters from the user's project path.

    An existing directory is removed first on a best-effort basis.
    """
    h = hashlib.sha1(
        f"{plan_path.resolve().as_posix()}|{job_name}".encode("utf-8")
    ).hexdigest()[:12]
    p = Path(tempfile.gettempdir()) / f"srt_edit_{h}"
    if p.exists():
        shutil.rmtree(p, ignore_errors=True)
    # rmtree(ignore_errors=True) may leave the directory behind (for example
    # when a file inside is still locked on Windows). exist_ok=True keeps
    # that case from crashing with FileExistsError; the directory is reused.
    p.mkdir(parents=True, exist_ok=True)
    return p


def _path_is_filter_safe(p: Path) -> bool:
    """Cheap libavfilter-path safety check: ASCII only and no single quotes."""
    s = str(p)
    return s.isascii() and "'" not in s


def _file_is_utf8(path: Path) -> bool:
    """Return False only when the file is readable and is not valid UTF-8.

    An unreadable or missing file returns True so that callers keep their
    pass-through behaviour and the failure surfaces where the file is
    actually consumed.
    """
    try:
        path.read_bytes().decode("utf-8")
    except UnicodeDecodeError:
        return False
    except OSError:
        return True
    return True


def ensure_safe_subs_path(src: Path) -> tuple[Path, Path | None]:
    """Return (path_to_feed_to_ffmpeg, cleanup_target_or_None).

    `src` is returned as-is, with no cleanup target, when its path is
    filter-safe AND its bytes are valid UTF-8. Otherwise the subtitles are
    decoded through read_srt_text and written as UTF-8 to a deterministic
    ASCII path under the system temp dir; that path is returned twice, the
    second time as the handle the caller should unlink in `finally`.

    The encoding check matters for GB18030 / cp936 files stored at a plain
    ASCII path: previously they were passed through untouched, although the
    subtitles filter assumes UTF-8 unless told otherwise.
    """
    if _path_is_filter_safe(src) and _file_is_utf8(src):
        return src, None
    h = hashlib.sha1(src.resolve().as_posix().encode("utf-8")).hexdigest()[:12]
    safe = Path(tempfile.gettempdir()) / f"srt_burn_{h}.srt"
    safe.write_text(read_srt_text(src), encoding="utf-8")
    return safe, safe
_TS_RE = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})")


def parse_timestamp(ts: str) -> float:
    """Convert an ``H:MM:SS,mmm`` timestamp into seconds.

    Accepts one or two hour digits, either ``,`` or ``.`` before the
    fraction, and one to three fraction digits (``"5"`` means 500 ms).
    Surrounding whitespace is ignored.

    Raises:
        ValueError: the value is not a string, does not match the pattern,
            or has a minute/second field above 59. Out-of-range fields used
            to be accepted and silently produced a wrong offset.
    """
    # Plan files are JSON, so a number or null can arrive here; report it as
    # a ValueError (the type callers already handle) instead of letting
    # `.strip()` raise AttributeError.
    if not isinstance(ts, str):
        raise ValueError(f"bad timestamp (expected a string): {ts!r}")
    m = _TS_RE.fullmatch(ts.strip())
    if not m:
        raise ValueError(f"bad timestamp: {ts!r}")
    h, mn, s, ms = m.groups()
    if int(mn) > 59 or int(s) > 59:
        raise ValueError(
            f"bad timestamp (minutes and seconds must be 00-59): {ts!r}"
        )
    return int(h) * 3600 + int(mn) * 60 + int(s) + int(ms.ljust(3, "0")) / 1000.0


def format_srt_ts(seconds: float) -> str:
    """Format seconds as ``HH:MM:SS,mmm``, rounded to the nearest millisecond.

    NOTE: negative input is not meaningful here; divmod floors toward
    negative infinity, so a negative value yields a malformed stamp.
    """
    total_ms = int(round(seconds * 1000))
    h, rem = divmod(total_ms, 3600_000)
    m, rem = divmod(rem, 60_000)
    s, ms = divmod(rem, 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
def _split_time_line(line: str) -> tuple[str, str]:
    """Split an SRT time line into (start_ts, end_ts) strings.

    Tolerates trailing cue settings like 'position:90% align:start' by
    keeping only the token adjacent to '-->' on each side: the last
    whitespace-delimited token on the left, the first on the right.

    Raises ValueError when '-->' is missing or either side is empty.
    """
    parts = line.split("-->", 1)
    if len(parts) != 2:
        raise ValueError(f"missing '-->' in time line: {line!r}")
    left_tokens = parts[0].strip().split()
    right_tokens = parts[1].strip().split()
    if not left_tokens or not right_tokens:
        raise ValueError(f"missing timestamps in time line: {line!r}")
    return left_tokens[-1], right_tokens[0]


def parse_srt(path: Path) -> list[SrtCue]:
    """Parse an SRT file into cues, in file order.

    The text is read through read_srt_text. Cues are separated by one or
    more blank lines; a line holding only spaces or tabs counts as blank,
    and CRLF / lone-CR line endings are accepted. Previously a separator
    line with stray whitespace did not split, so two cues merged into one
    and the second id silently vanished.

    Blocks with fewer than two non-blank lines are skipped. Within a block,
    line 1 is the numeric id, line 2 the time line, the rest is the text
    (joined with newlines, trailing whitespace stripped per line).

    Raises SystemExit on a non-numeric id, a missing time line, or an
    unparsable timestamp.
    """
    raw = read_srt_text(path)
    # Normalise line endings first so one separator pattern covers all files.
    normalized = raw.replace("\r\n", "\n").replace("\r", "\n")
    # A separator is a newline followed by one or more whitespace-only lines.
    blocks = re.split(r"\n(?:[^\S\n]*\n)+", normalized.strip())
    cues: list[SrtCue] = []
    for block in blocks:
        lines = [ln.rstrip() for ln in block.splitlines() if ln.strip() != ""]
        if len(lines) < 2:
            continue
        try:
            idx = int(lines[0].strip())
        except ValueError:
            raise SystemExit(f"SRT block missing numeric id: {lines[0]!r}")
        if "-->" not in lines[1]:
            raise SystemExit(f"SRT block missing time line: {lines[1]!r}")
        try:
            a, b = _split_time_line(lines[1])
            start = parse_timestamp(a)
            end = parse_timestamp(b)
        except ValueError as e:
            raise SystemExit(f"SRT id={lines[0]}: {e}")
        cues.append(SrtCue(id=idx, final_start=start, final_end=end,
                           text="\n".join(lines[2:])))
    return cues
def _plan_entry_from_row(row: Any, index: int, *, form_b: bool) -> PlanEntry:
    """Build one PlanEntry from a decoded JSON row, with clean error reporting.

    `index` is the zero-based position of the row and is only used in error
    messages. For Form A rows (form_b=False) the source name is the
    synthetic "_default" and no voice is read.
    """
    where = f"edit_plan segment #{index + 1}"
    if not isinstance(row, dict):
        raise SystemExit(
            f"{where}: expected a JSON object, got {type(row).__name__}"
        )
    try:
        return PlanEntry(
            id=int(row["id"]),
            source_name=str(row["source"]) if form_b else "_default",
            source_start=parse_timestamp(row["source_start"]),
            source_end=parse_timestamp(row["source_end"]),
            voice_name=row.get("voice") if form_b else None,
        )
    except KeyError as e:
        raise SystemExit(f"{where}: missing required field {e}")
    except (TypeError, ValueError, AttributeError) as e:
        # Bad id, malformed timestamp, or a non-string timestamp value.
        raise SystemExit(f"{where}: {e}")


def _resolve_plan_paths(raw_map: Any, label: str, base: Path) -> dict[str, Path]:
    """Turn a name -> path JSON object into a name -> Path map.

    Relative paths are resolved against `base`. A missing or empty map
    yields an empty dict.
    """
    raw_map = raw_map or {}
    if not isinstance(raw_map, dict):
        raise SystemExit(
            f"edit_plan '{label}' must be a JSON object mapping name to path"
        )
    resolved: dict[str, Path] = {}
    for name, p in raw_map.items():
        if not isinstance(p, str):
            raise SystemExit(
                f"edit_plan '{label}' entry '{name}': path must be a string, "
                f"got {type(p).__name__}"
            )
        target = Path(p)
        if not target.is_absolute():
            target = (base / target).resolve()
        resolved[name] = target
    return resolved


def parse_plan(path: Path) -> tuple[dict[str, Path], dict[str, Path], list[PlanEntry]]:
    """Returns (sources_map, voices_map, entries). Detects Form A vs B.

    Form A is a JSON array of segments: both maps come back empty and every
    entry uses the source name "_default". Form B is a JSON object with a
    required 'segments' array and optional 'sources' / 'voices' maps, whose
    relative paths are resolved against the plan file's directory.

    The file is read as UTF-8 with an optional BOM: editors on Windows often
    write one, and json.loads rejects a leading BOM.

    Raises SystemExit for an unreadable or non-UTF-8 file, invalid JSON, a
    wrong top-level or container type, or a malformed segment row.
    """
    try:
        raw = path.read_text(encoding="utf-8-sig")
    except OSError as e:
        raise SystemExit(f"edit_plan unreadable: {path}: {e}")
    except UnicodeDecodeError as e:
        raise SystemExit(f"edit_plan is not valid UTF-8: {path}: {e}")
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        raise SystemExit(
            f"edit_plan is not valid JSON: {path}: "
            f"line {e.lineno} col {e.colno}: {e.msg}"
        )
    base = path.parent

    if isinstance(data, list):
        entries = [
            _plan_entry_from_row(row, i, form_b=False)
            for i, row in enumerate(data)
        ]
        return {}, {}, entries

    if not isinstance(data, dict):
        raise SystemExit("edit_plan must be a JSON array or object")
    if "segments" not in data:
        raise SystemExit("Form B plan missing 'segments' field")
    if not isinstance(data["segments"], list):
        raise SystemExit("edit_plan 'segments' must be a JSON array")

    sources_map = _resolve_plan_paths(data.get("sources"), "sources", base)
    voices_map = _resolve_plan_paths(data.get("voices"), "voices", base)
    entries = [
        _plan_entry_from_row(row, i, form_b=True)
        for i, row in enumerate(data["segments"])
    ]
    return sources_map, voices_map, entries
def validate_alignment(cues: list[SrtCue], entries: list[PlanEntry]) -> None:
    """Check that the SRT and the plan reference exactly the same ids.

    Returns silently when the two id sets are equal. Otherwise raises
    SystemExit naming, in sorted order, the ids present only in the SRT
    and/or only in the plan.
    """
    srt_ids = set()
    for cue in cues:
        srt_ids.add(cue.id)
    planned_ids = set()
    for entry in entries:
        planned_ids.add(entry.id)

    missing_from_plan = srt_ids - planned_ids
    missing_from_srt = planned_ids - srt_ids
    if not missing_from_plan and not missing_from_srt:
        return

    problems = []
    if missing_from_plan:
        problems.append(f"in SRT but not in plan: {sorted(missing_from_plan)}")
    if missing_from_srt:
        problems.append(f"in plan but not in SRT: {sorted(missing_from_srt)}")
    raise SystemExit("id mismatch: " + "; ".join(problems))
============================================================================ + + +def align( + cues: list[SrtCue], + entries: list[PlanEntry], + sources_map: dict[str, Path], + voices_map: dict[str, Path], + legacy_default_source: Path | None, + tolerance: float, + trim_direction: str, + on_short: str, +) -> list[Segment]: + cue_by_id = {c.id: c for c in cues} + plan_by_id = {e.id: e for e in entries} + + segments: list[Segment] = [] + prev_out_end = 0.0 + for cid in sorted(cue_by_id): + cue = cue_by_id[cid] + pln = plan_by_id[cid] + src_dur = pln.duration + target = cue.duration + + pad_short = False + if src_dur + tolerance < target: + short_by = target - src_dur + if on_short == "error": + raise SystemExit( + f"id={cid}: source is {short_by:.3f}s shorter than SRT target " + f"({src_dur:.3f}s vs {target:.3f}s). Pass --on-short=pad to " + f"freeze-pad the tail, or extend the source range." + ) + pad_short = True + src_start = pln.source_start + src_end = pln.source_end + elif src_dur > target + tolerance: + if trim_direction == "tail": + src_start = pln.source_start + src_end = pln.source_start + target + elif trim_direction == "head": + src_start = pln.source_end - target + src_end = pln.source_end + elif trim_direction == "center": + overhang = (src_dur - target) / 2 + src_start = pln.source_start + overhang + src_end = pln.source_end - overhang + else: + raise ValueError(f"unknown trim_direction: {trim_direction}") + else: + src_start = pln.source_start + src_end = pln.source_start + target + + if pln.source_name == "_default": + assert legacy_default_source is not None + source_path = legacy_default_source + else: + source_path = sources_map[pln.source_name] + + voice_path = voices_map[pln.voice_name] if pln.voice_name else None + gap = max(0.0, cue.final_start - prev_out_end) + segments.append(Segment( + id=cid, + source_path=source_path, + source_start=src_start, + source_end=src_end, + out_start=cue.final_start, + out_end=cue.final_end, + leading_gap=gap, + 
text=cue.text, + voice_path=voice_path, + pad_short=pad_short, + plan_src_dur=src_dur, + )) + prev_out_end = cue.final_end + + return segments + + +# ============================================================================ +# Style resolution +# ============================================================================ + + +def has_cjk(cues: list[SrtCue]) -> bool: + return any(CJK_RE.search(c.text) for c in cues) + + +def resolve_style(style_arg: str, cues: list[SrtCue]) -> str: + if style_arg == "auto": + return STYLE_TEMPLATES["cjk-natural" if has_cjk(cues) else "bold-uppercase"] + if style_arg in STYLE_TEMPLATES: + return STYLE_TEMPLATES[style_arg] + if "=" in style_arg: + return style_arg + raise SystemExit( + f"unknown style: {style_arg!r}. Known templates: " + f"{sorted(STYLE_TEMPLATES)}. Pass a raw ASS string with '=' to override." + ) + + +# ============================================================================ +# Clip cache +# ============================================================================ + + +def _file_fingerprint(path: Path) -> tuple[int, int]: + st = path.stat() + return (int(st.st_mtime_ns), st.st_size) + + +def cache_key(seg: Segment, effective_bg_volume: float, hdr: bool, + portrait: bool, voice_signature: tuple | None, + ffmpeg_version: str) -> str: + fp = _file_fingerprint(seg.source_path) + payload = json.dumps([ + CACHE_VERSION, + str(seg.source_path.resolve()), fp[0], fp[1], + round(seg.source_start, 4), round(seg.source_end, 4), + round(seg.duration, 4), + round(effective_bg_volume, 4), + hdr, portrait, + seg.pad_short, round(seg.plan_src_dur, 4), + PARAMS_FINGERPRINT, + ffmpeg_version, + voice_signature, + ], sort_keys=True) + return hashlib.sha256(payload.encode()).hexdigest()[:32] + + +def cache_lookup(cache_dir: Path, key: str) -> Path | None: + p = cache_dir / f"{key}.mp4" + return p if p.exists() else None + + +def cache_store(cache_dir: Path, key: str, clip_path: Path) -> None: + cache_dir.mkdir(parents=True, 
exist_ok=True) + shutil.copy2(clip_path, cache_dir / f"{key}.mp4") + + +# ============================================================================ +# ffmpeg orchestration +# ============================================================================ + + +class PipelineError(SystemExit): + """SystemExit subclass carrying ffmpeg stderr context for diagnostics. + + Batch loops pattern-match on `stderr_tail` to write a richer failure + record. Plain SystemExit raised by pre-flight / validation code keeps + working — callers use `getattr(e, 'stderr_tail', '')` so both branches + of `try/except SystemExit` flow through the same handler. + """ + def __init__(self, message: str, *, stderr_tail: str = ""): + super().__init__(message) + self.stderr_tail = stderr_tail + + +def _tail_text(s: str, *, max_lines: int = 30, max_chars: int = 2000) -> str: + """Return the last `max_lines` of `s`, capped at `max_chars`. + + Used to attach a readable slice of ffmpeg's stderr to PipelineError — + enough to diagnose, not so much that batch summaries balloon. + """ + if not s: + return "" + lines = s.strip().splitlines() + tail = "\n".join(lines[-max_lines:]) + if len(tail) > max_chars: + tail = "...[truncated]...\n" + tail[-(max_chars - 22):] + return tail + + +def run_ff(cmd: list[str], desc: str) -> None: + print(f" $ {desc}") + proc = subprocess.run(cmd, capture_output=True, text=True, + encoding="utf-8", errors="replace") + if proc.returncode != 0: + # Stream raw stderr to the console so an interactive user sees the + # failure live; also attach a bounded tail to the exception so a + # batch summary can capture diagnostic context without keeping the + # full stderr in memory or in the JSON. 
+ sys.stderr.write(proc.stderr or "") + raise PipelineError( + f"ffmpeg failed: {desc}", + stderr_tail=_tail_text(proc.stderr or ""), + ) + + +def probe_duration(path: Path) -> float: + out = subprocess.run( + ["ffprobe", "-v", "error", "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", str(path)], + capture_output=True, text=True, check=True, + ) + return float(out.stdout.strip()) + + +def scale_filter_for(source: Path) -> str: + return "scale=-2:1920" if is_portrait_source(source) else "scale=1920:-2" + + +def _voice_signature(voice_path: Path | None, target: float) -> tuple | None: + if voice_path is None: + return None + fp = _file_fingerprint(voice_path) + return (str(voice_path.resolve()), fp[0], fp[1], round(target, 4)) + + +def extract_segment( + seg: Segment, + out_path: Path, + bg_volume: float, +) -> None: + """Extract one segment to 1080p 24fps with audio resolved per-segment. + + `bg_volume` here is the EFFECTIVE level — callers must already have + zeroed it for sources whose ffprobe says there is no audio track. 
+ + Audio resolution: + voice_path present + bg_volume > 0 → mix voice + source*bg + voice_path present + bg_volume == 0 → voice only + voice_path absent + bg_volume > 0 → source audio at bg_volume (fades) + voice_path absent + bg_volume == 0 → silent + """ + keep_audio_from_source = bg_volume > 0.0 + out_path.parent.mkdir(parents=True, exist_ok=True) + target = seg.duration + + vf_parts: list[str] = [] + if is_hdr_source(seg.source_path): + vf_parts.append(TONEMAP_CHAIN) + vf_parts.append(scale_filter_for(seg.source_path)) + + if seg.pad_short and seg.plan_src_dur + 1e-6 < target: + vf_parts.append( + f"tpad=stop_mode=clone:stop_duration={target - seg.plan_src_dur:.3f}" + ) + v_input_dur = seg.plan_src_dur + else: + v_input_dur = target + + vf_parts.append(V_SYNC_TAIL) + vf = ",".join(vf_parts) + + inputs: list[str] = [ + "-ss", f"{seg.source_start:.3f}", + "-i", str(seg.source_path), + "-t", f"{v_input_dur:.3f}", + ] + + has_voice = seg.voice_path is not None + voice_index: int | None = None + if has_voice: + voice_index = 1 + inputs += ["-i", str(seg.voice_path)] + + # Audio filter graph — applied via -filter_complex when we have voice, + # otherwise simple -af on source audio. 
+ audio_args: list[str] = [] + if has_voice and bg_volume <= 0.0: + fade_out = max(0.0, target - 0.03) + ac_parts = [ + f"[{voice_index}:a]apad=whole_dur={target:.3f}," + f"atrim=duration={target:.3f},asetpts=PTS-STARTPTS," + f"afade=t=in:st=0:d=0.03," + f"afade=t=out:st={fade_out:.3f}:d=0.03," + f"{A_SYNC_TAIL}[outa]" + ] + audio_args = ["-filter_complex", ";".join(ac_parts), + "-map", "[outa]"] + elif has_voice and bg_volume > 0.0: + fade_out = max(0.0, target - 0.03) + ac_parts = [ + f"[{voice_index}:a]apad=whole_dur={target:.3f}," + f"atrim=duration={target:.3f},asetpts=PTS-STARTPTS[voice]", + f"[0:a]volume={bg_volume:.3f}," + f"afade=t=in:st=0:d=0.03,afade=t=out:st={fade_out:.3f}:d=0.03[bg]", + f"[voice][bg]amix=inputs=2:duration=first:normalize=0," + f"{A_SYNC_TAIL}[outa]", + ] + audio_args = ["-filter_complex", ";".join(ac_parts), + "-map", "[outa]"] + elif not has_voice and keep_audio_from_source: + fade_out = max(0.0, target - 0.03) + af = ( + f"volume={bg_volume:.3f}," + f"afade=t=in:st=0:d=0.03,afade=t=out:st={fade_out:.3f}:d=0.03," + f"{A_SYNC_TAIL}" + ) + if seg.pad_short and seg.plan_src_dur + 1e-6 < target: + af = f"apad=whole_dur={target:.3f},{af}" + audio_args = ["-af", af, "-map", "0:a"] + else: + # silent track via lavfi so concat inputs share an audio stream + inputs += [ + "-f", "lavfi", "-t", f"{target:.3f}", + "-i", f"anullsrc=channel_layout=stereo:sample_rate={SAMPLE_RATE}", + ] + silent_idx = 2 if has_voice else 1 + audio_args = ["-af", A_SYNC_TAIL, "-map", f"{silent_idx}:a"] + + cmd: list[str] = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + *inputs, + "-vf", vf, "-r", str(FPS), + "-map", "0:v", + *audio_args, + "-c:v", "libx264", "-preset", "fast", "-crf", "20", + "-pix_fmt", "yuv420p", + "-c:a", "aac", "-b:a", AUDIO_BITRATE, "-ar", str(SAMPLE_RATE), "-ac", "2", + "-t", f"{target:.3f}", + "-movflags", "+faststart", + str(out_path), + ] + run_ff(cmd, f"extract id={seg.id} src[{seg.source_start:.2f}-{seg.source_end:.2f}] → 
{out_path.name}") + + +def make_gap_clip(duration: float, portrait: bool, out_path: Path) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + size = "1080x1920" if portrait else "1920x1080" + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-f", "lavfi", "-i", f"color=c=black:s={size}:r={FPS}:d={duration:.3f}", + "-f", "lavfi", "-i", + f"anullsrc=channel_layout=stereo:sample_rate={SAMPLE_RATE}", + "-t", f"{duration:.3f}", + "-vf", V_SYNC_TAIL, + "-af", A_SYNC_TAIL, + "-c:v", "libx264", "-preset", "fast", "-crf", "20", + "-pix_fmt", "yuv420p", "-r", str(FPS), + "-c:a", "aac", "-b:a", AUDIO_BITRATE, "-ar", str(SAMPLE_RATE), + "-movflags", "+faststart", + str(out_path), + ] + run_ff(cmd, f"gap {duration:.3f}s → {out_path.name}") + + +def concat_clips(clip_paths: list[Path], out_path: Path, work_dir: Path) -> None: + """Concat losslessly via the demuxer. work_dir is assumed safe-ASCII. + + Each line is `file ` with the quoting routine that handles + spaces, single quotes, and CJK. Callers should register the list file + for cleanup BEFORE this is invoked so a mid-write failure still cleans up. + """ + concat_list = work_dir / "_concat_srt_driven.txt" + lines = [f"file {concat_quote_path(p)}\n" for p in clip_paths] + concat_list.write_text("".join(lines), encoding="utf-8") + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-f", "concat", "-safe", "0", + "-i", str(concat_list), + "-c", "copy", + "-movflags", "+faststart", + str(out_path), + ] + run_ff(cmd, f"concat {len(clip_paths)} clips → {out_path.name}") + + +def burn_subtitles( + base_path: Path, + subs_path: Path, + style: str, + fontsdir: Path | None, + out_path: Path, + *, + global_voice: Path | None = None, + total_duration: float = 0.0, +) -> None: + """Final pass: optional global-voice mix + subtitle burn (LAST). + + Self-defending on subs_path: if not filter-safe, copied to a deterministic + temp SRT first so libavfilter never sees the problematic original. 
+ fontsdir, if given, must already be filter-safe — we error rather than + copy an entire font directory. + + Audio handling: + - global_voice is None: pass base audio through (`-c:a copy`). + - global_voice given: voice is apad'd / atrim'd to exactly total_duration + so it spans the entire output timeline, then mixed on top of base's + audio. Base already contains source*bg_volume (or silence) from + extract_segment, so we do NOT re-scale it here — that would double- + attenuate the background. amix uses duration=first so the result + runs exactly total_duration; normalize=0 keeps levels predictable. + """ + if fontsdir is not None and not _path_is_filter_safe(fontsdir): + raise SystemExit( + f"fontsdir contains non-ASCII or single-quote characters; " + f"move it to a safe ASCII path first: {fontsdir}" + ) + + safe_subs, cleanup_target = ensure_safe_subs_path(subs_path) + try: + subs_arg = subs_filter_escape(safe_subs) + style_escaped = style.replace("'", r"\'") + if fontsdir is not None: + fd = subs_filter_escape(fontsdir) + subs_filter = f"subtitles='{subs_arg}':fontsdir='{fd}':force_style='{style_escaped}'" + else: + subs_filter = f"subtitles='{subs_arg}':force_style='{style_escaped}'" + + if global_voice is None: + # No audio work — just burn subtitles, copy audio. + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-i", str(base_path), + "-vf", subs_filter, + "-c:v", "libx264", "-preset", "fast", "-crf", "18", + "-pix_fmt", "yuv420p", + "-c:a", "copy", + "-movflags", "+faststart", + str(out_path), + ] + label = f"subtitle burn (LAST) → {out_path.name}" + else: + if total_duration <= 0.0: + raise SystemExit( + "burn_subtitles: total_duration must be > 0 when global_voice is set" + ) + voice_chain = ( + f"[1:a]apad=whole_dur={total_duration:.3f}," + f"atrim=duration={total_duration:.3f}," + f"asetpts=PTS-STARTPTS," + f"{A_SYNC_TAIL}" + ) + # base [0:a] already contains source*bg_volume from extract; do NOT + # apply bg_volume again here. 
amix combines voice + existing base + # audio (which is silent on gaps and on segments with bg_volume=0). + filter_complex = ( + f"[0:v]{subs_filter}[outv];" + f"{voice_chain}[voice];" + f"[voice][0:a]amix=inputs=2:duration=first:normalize=0[outa]" + ) + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-i", str(base_path), + "-i", str(global_voice), + "-filter_complex", filter_complex, + "-map", "[outv]", "-map", "[outa]", + "-c:v", "libx264", "-preset", "fast", "-crf", "18", + "-pix_fmt", "yuv420p", + "-c:a", "aac", "-b:a", AUDIO_BITRATE, "-ar", str(SAMPLE_RATE), + "-movflags", "+faststart", + str(out_path), + ] + label = f"subtitle burn (LAST) + global voice mix → {out_path.name}" + + run_ff(cmd, label) + finally: + if cleanup_target is not None: + try: + cleanup_target.unlink() + except OSError: + pass + + +# ============================================================================ +# EDL + QC artifacts +# ============================================================================ + + +def write_edl(segments: list[Segment], srt: Path, plan: Path, + bg_volume: float, style_name: str, out_path: Path) -> None: + edl = { + "version": "srt-driven-2", + "script_srt": str(srt.resolve()), + "plan": str(plan.resolve()), + "bg_volume": bg_volume, + "style": style_name, + "segments": [ + { + "id": s.id, + "source": str(s.source_path.resolve()), + "source_start": format_srt_ts(s.source_start), + "source_end": format_srt_ts(s.source_end), + "out_start": format_srt_ts(s.out_start), + "out_end": format_srt_ts(s.out_end), + "duration": round(s.duration, 3), + "leading_gap": round(s.leading_gap, 3), + "voice": str(s.voice_path.resolve()) if s.voice_path else None, + "pad_short": s.pad_short, + "text": s.text, + } + for s in segments + ], + "total_duration_s": round(segments[-1].out_end, 3) if segments else 0.0, + } + out_path.write_text(json.dumps(edl, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" EDL → {out_path.name}") + + +def _dir_size(path: Path) 
-> int: + if not path.exists(): + return 0 + total = 0 + for p in path.rglob("*"): + if p.is_file(): + total += p.stat().st_size + return total + + +def build_qc_report( + job_name: str, + segments: list[Segment], + seg_clip_info: list[dict], + output_path: Path, + expected_duration: float, + style_name: str, + style_resolved: str, + bg_volume: float, + has_any_voice: bool, + elapsed_s: float, + edit_dir: Path, + work_dir: Path, + cache_dir: Path, + out_qc_path: Path, +) -> dict: + actual_dur = probe_duration(output_path) + drift_ms = round((actual_dur - expected_duration) * 1000) + + audio_mode = ( + "voice_replace" if has_any_voice and bg_volume <= 0.0 + else "voice_mix" if has_any_voice + else "original_only" if bg_volume > 0.0 + else "silent" + ) + + seg_records = [] + for seg, info in zip(segments, seg_clip_info): + actual_seg = probe_duration(info["clip_path"]) if Path(info["clip_path"]).exists() else 0.0 + seg_records.append({ + "id": seg.id, + "expected_duration_s": round(seg.duration, 3), + "actual_duration_s": round(actual_seg, 3), + "drift_ms": round((actual_seg - seg.duration) * 1000), + "cached": info["cached"], + "clip_size_bytes": Path(info["clip_path"]).stat().st_size if Path(info["clip_path"]).exists() else 0, + "source": str(seg.source_path), + "voice": str(seg.voice_path) if seg.voice_path else None, + }) + + clips_size = sum(s["clip_size_bytes"] for s in seg_records) + final_size = output_path.stat().st_size + cache_size = _dir_size(cache_dir) + work_dir_size = _dir_size(work_dir) + + report = { + "job": job_name, + "ok": abs(actual_dur - expected_duration) <= DURATION_DRIFT_TOLERANCE_S, + "elapsed_s": round(elapsed_s, 2), + "duration": { + "expected_s": round(expected_duration, 3), + "actual_s": round(actual_dur, 3), + "drift_ms": drift_ms, + "tolerance_ms": int(DURATION_DRIFT_TOLERANCE_S * 1000), + "within_tolerance": abs(actual_dur - expected_duration) <= DURATION_DRIFT_TOLERANCE_S, + }, + "segments": seg_records, + "subtitles": { + 
"applied": True, + "style_name": style_name, + "force_style": style_resolved, + "cue_count": len(segments), + }, + "audio": { + "mode": audio_mode, + "bg_volume": bg_volume, + "voice_used": has_any_voice, + }, + "disk_usage_bytes": { + "work_dir_total": work_dir_size, + "clips_in_work_dir": clips_size, + "final_output": final_size, + "cache": cache_size, + }, + "output_path": str(output_path), + } + out_qc_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" QC report → {out_qc_path.name}") + return report + + +# ============================================================================ +# Single-job runner +# ============================================================================ + + +@dataclass +class Job: + source: Path | None # legacy single-source path; None if Form B + srt: Path + plan: Path + voice: Path | None # global voice override (mutually exclusive with per-segment) + bg_volume: float + tolerance: float + trim_direction: str + on_short: str + style: str + fontsdir: Path | None + output: Path | None + name: str + no_cache: bool + keep_intermediates: bool + no_overwrite: bool = False + mode: str = "full" # "full" (default) | "extract" (stop after segments) + + +def run_job(job: Job, ffmpeg_version: str) -> dict: + t0 = time.time() + print(f"\n== job: {job.name} ==") + + cues = parse_srt(job.srt) + validate_srt(cues) + sources_map, voices_map, entries = parse_plan(job.plan) + + legacy_source: Path | None = None + if sources_map: + if job.source is not None: + print(" note: --source ignored (plan defines its own sources)") + else: + if job.source is None: + raise SystemExit("Form A plan needs --source ") + legacy_source = job.source.resolve() + + has_per_seg_voice = any(e.voice_name for e in entries) + if job.voice is not None and has_per_seg_voice: + raise SystemExit( + "voice conflict: --voice given AND plan contains per-segment voices. " + "Pick one." 
+ ) + + # Global voice is NOT expanded into per-segment entries. Per-segment voices + # play during their segment's window; a global voice spans the entire + # output timeline and is mixed in during the final compose step. Doing it + # at extract time would replay voice[0:seg_dur] for every segment, which + # is wrong for any voice longer than one segment. + global_voice: Path | None = job.voice + if global_voice is not None: + v_info = probe_streams(global_voice) + if not v_info["has_audio"]: + raise SystemExit(f"global --voice file has no audio track: {global_voice}") + print(f" global voice: {global_voice.name} ({v_info['duration']:.3f}s)") + + validate_plan(entries, sources_map, voices_map, legacy_source) + validate_alignment(cues, entries) + + # Probe every source once. Cache by Path to avoid repeat ffprobe calls + # when many segments share a source. + unique_sources: dict[str, Path] = {} + if legacy_source is not None: + unique_sources["_default"] = legacy_source + for name, p in sources_map.items(): + unique_sources[name] = p + + source_info: dict[str, dict] = {} + source_info_by_path: dict[Path, dict] = {} + print(" probing sources:") + for name, p in unique_sources.items(): + info = probe_streams(p) + source_info[name] = info + source_info_by_path[p] = info + print(f" {name}: video={info['has_video']} audio={info['has_audio']} " + f"duration={info['duration']:.3f}s") + if not info["has_video"]: + raise SystemExit(f"source '{name}' has no video stream: {p}") + + # Range bounds — fail fast rather than letting ffmpeg fail mid-batch. 
+ for e in entries: + info = source_info[e.source_name] + if e.source_end > info["duration"] + job.tolerance: + raise SystemExit( + f"plan id={e.id}: source_end {e.source_end:.3f}s exceeds " + f"source '{e.source_name}' duration {info['duration']:.3f}s " + f"(tolerance ±{job.tolerance}s)" + ) + + # Effective bg_volume per source: if source has no audio track, force to 0 + # rather than letting ffmpeg fail on a missing 0:a stream reference. + no_audio_names = [n for n, info in source_info.items() if not info["has_audio"]] + if no_audio_names and job.bg_volume > 0.0: + print(f" WARNING: source(s) {no_audio_names} have no audio track — " + f"bg_volume forced to 0 for segments from them") + + segments = align( + cues, entries, sources_map, voices_map, legacy_source, + tolerance=job.tolerance, trim_direction=job.trim_direction, + on_short=job.on_short, + ) + + edit_dir = (job.output.parent if job.output else job.plan.parent / "edit") + edit_dir.mkdir(parents=True, exist_ok=True) + out_path = job.output.resolve() if job.output else ( + edit_dir / f"final_srt_driven_{safe_ascii_name(job.name)}.mp4" + ) + + # Output-overwrite check only matters in modes that actually produce + # final output. Extract mode stops before any out_path is written, so + # checking it would produce spurious warnings about an unrelated file. + if job.mode == "full" and out_path.exists(): + if job.no_overwrite: + raise SystemExit(f"output exists and --no-overwrite set: {out_path}") + print(f" WARNING: overwriting existing output: {out_path}") + + style_resolved = resolve_style(job.style, cues) + print(f" style: {job.style} ({len(cues)} cues, cjk={has_cjk(cues)}) mode={job.mode}") + + # All intermediates live in a safe-ASCII temp dir under tempfile.gettempdir(). + # Wiped at start so a previous crashed run cannot pollute. Wiped at end + # (in finally) unless --keep-intermediates is set. 
+ work_dir = make_safe_work_dir(job.name, job.plan) + print(f" work dir: {work_dir}") + + try: + # SRT normalized to UTF-8 with encoding fallback (handles GB18030 input). + # Lives in the safe work dir so its path is guaranteed friendly to libass. + safe_subs = work_dir / "subs.srt" + safe_subs.write_text(read_srt_text(job.srt), encoding="utf-8") + + edl_path = edit_dir / f"edl_srt_driven_{safe_ascii_name(job.name)}.json" + write_edl(segments, job.srt, job.plan, job.bg_volume, job.style, edl_path) + + clips_dir = work_dir / "clips" + clips_dir.mkdir(parents=True, exist_ok=True) + cache_dir = edit_dir / "cache_srt_driven" + + portrait = is_portrait_source(segments[0].source_path) + + clip_paths: list[Path] = [] + seg_clip_info: list[dict] = [] + any_voice = any(s.voice_path is not None for s in segments) + + print(f"\n extracting {len(segments)} segments cache={'off' if job.no_cache else 'on'} voice={'per-seg' if any_voice else 'none'}") + for i, seg in enumerate(segments): + # Gap clips are a concat-time concept (synthetic black + silence + # bridging non-contiguous SRT cues). Extract mode emits only the + # real source segments, so skip gap clips entirely there. + if job.mode != "extract" and seg.leading_gap > 0.001: + gap_path = clips_dir / f"gap_{i:02d}_{seg.leading_gap:.3f}.mp4" + if not gap_path.exists(): + make_gap_clip(seg.leading_gap, portrait, gap_path) + clip_paths.append(gap_path) + + seg_path = clips_dir / f"seg_{i:02d}_id{seg.id}.mp4" + voice_sig = _voice_signature(seg.voice_path, seg.duration) + + # Effective bg_volume for THIS segment: forced to 0 if its source + # has no audio track. Keeps ffmpeg from referencing a missing 0:a. 
+ src_has_audio = source_info_by_path[seg.source_path]["has_audio"] + effective_bg = job.bg_volume if src_has_audio else 0.0 + + ck = cache_key( + seg, + effective_bg_volume=effective_bg, + hdr=is_hdr_source(seg.source_path), + portrait=portrait, + voice_signature=voice_sig, + ffmpeg_version=ffmpeg_version, + ) if not job.no_cache else None + + cached_hit = False + if ck and (hit := cache_lookup(cache_dir, ck)) is not None: + shutil.copy2(hit, seg_path) + print(f" [cache hit] id={seg.id} → {seg_path.name}") + cached_hit = True + else: + extract_segment(seg, seg_path, bg_volume=effective_bg) + if ck: + cache_store(cache_dir, ck, seg_path) + + clip_paths.append(seg_path) + seg_clip_info.append({"clip_path": str(seg_path), "cached": cached_hit}) + + # ---- Extract mode: copy clips to a persistent location and stop ---- + if job.mode == "extract": + extracted_dir = edit_dir / f"extracted_clips_{safe_ascii_name(job.name)}" + extracted_dir.mkdir(parents=True, exist_ok=True) + # Wipe stale clips from a prior run so the dir reflects only this + # run's segments — same pattern as srt_video_editor.py. 
+ for stale in extracted_dir.glob("clip_*.mp4"): + stale.unlink() + copied: list[dict] = [] + for seg, info in zip(segments, seg_clip_info): + src = Path(info["clip_path"]) + if not src.exists(): + continue + dst = extracted_dir / f"clip_{seg.id:03d}.mp4" + shutil.copy2(src, dst) + copied.append({ + "id": seg.id, + "filename": dst.name, + "expected_duration_s": round(seg.duration, 3), + "cached_from_prev_run": info["cached"], + }) + print(f"\n=== extract mode: stopping after segment extraction ===") + print(f" {len(copied)} clip(s) saved to: {extracted_dir}/") + for c in copied: + print(f" {c['filename']:<24} " + f"({c['expected_duration_s']:.3f}s)" + + (" [cache hit]" if c["cached_from_prev_run"] else "")) + return { + "job": job.name, + "ok": True, + "mode": "extract", + "extracted_dir": str(extracted_dir), + "clip_count": len(copied), + "segments": copied, + "elapsed_s": round(time.time() - t0, 2), + } + # ---- Full mode continues to concat + compose ---- + + base_path = work_dir / "base.mp4" + concat_clips(clip_paths, base_path, work_dir) + + total_duration = segments[-1].out_end + burn_subtitles( + base_path, safe_subs, style_resolved, job.fontsdir, out_path, + global_voice=global_voice, + total_duration=total_duration, + ) + + # QC voice flag must reflect EITHER per-segment OR global voice usage. 
+ voice_used = any_voice or (global_voice is not None) + + qc_path = edit_dir / f"qc_report_{safe_ascii_name(job.name)}.json" + qc_report = build_qc_report( + job_name=job.name, + segments=segments, + seg_clip_info=seg_clip_info, + output_path=out_path, + expected_duration=total_duration, + style_name=job.style, + style_resolved=style_resolved, + bg_volume=job.bg_volume, + has_any_voice=voice_used, + elapsed_s=time.time() - t0, + edit_dir=edit_dir, + work_dir=work_dir, + cache_dir=cache_dir, + out_qc_path=qc_path, + ) + print(f"\n done in {qc_report['elapsed_s']}s, drift={qc_report['duration']['drift_ms']}ms") + return qc_report + + finally: + if job.keep_intermediates: + print(f" intermediates kept at: {work_dir}") + else: + shutil.rmtree(work_dir, ignore_errors=True) + + +# ============================================================================ +# Batch manifest +# ============================================================================ + + +def make_failure_record( + *, + index: int, + name: str, + error: BaseException, + job: "Job | None" = None, + manifest_row: dict | None = None, +) -> dict: + """Build a diagnostic failure entry for a batch summary. + + Shape: `{job, ok=False, index, error, stderr_tail, srt, plan, source, output}`. + + `stderr_tail` is non-empty only for `PipelineError` (i.e. ffmpeg failures); + plain `SystemExit` from validation paths leaves it as "". When `job` is + provided, paths come from the resolved Job; otherwise they fall back to + the raw manifest_row dict so rows that crash inside `job_from_dict` + still get useful context. 
+ """ + stderr_tail = "" + if isinstance(error, PipelineError): + stderr_tail = error.stderr_tail or "" + + if job is not None: + srt = str(job.srt) if job.srt else None + plan = str(job.plan) if job.plan else None + source = str(job.source) if job.source else None + output = str(job.output) if job.output else None + elif manifest_row is not None: + srt = manifest_row.get("srt") + plan = manifest_row.get("plan") + source = manifest_row.get("source") + output = manifest_row.get("output") + else: + srt = plan = source = output = None + + return { + "job": name, + "ok": False, + "index": index, + "error": str(error), + "stderr_tail": stderr_tail, + "srt": srt, + "plan": plan, + "source": source, + "output": output, + } + + +def load_manifest(path: Path) -> list[dict]: + suffix = path.suffix.lower() + if suffix == ".json": + try: + raw = path.read_text(encoding="utf-8") + except OSError as e: + raise SystemExit(f"batch manifest unreadable: {path}: {e}") + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise SystemExit( + f"batch manifest is not valid JSON: {path}: " + f"line {e.lineno} col {e.colno}: {e.msg}" + ) + if not isinstance(data, list): + raise SystemExit("batch manifest JSON must be an array of job dicts") + return data + if suffix == ".csv": + rows: list[dict] = [] + try: + with path.open(newline="", encoding="utf-8-sig") as f: + for row in csv.DictReader(f): + rows.append({k: v for k, v in row.items() if v != ""}) + except (OSError, csv.Error) as e: + raise SystemExit(f"batch manifest CSV error: {path}: {e}") + return rows + raise SystemExit(f"unsupported manifest format: {suffix}") + + +def job_from_dict(d: dict, defaults: argparse.Namespace, manifest_dir: Path, + idx: int) -> Job: + def _path(key: str) -> Path | None: + v = d.get(key) + if v in (None, ""): + return None + p = Path(v) + return p if p.is_absolute() else (manifest_dir / p).resolve() + + def _float(key: str, fb: float) -> float: + v = d.get(key) + return float(v) if v not 
in (None, "") else fb + + def _str(key: str, fb: str) -> str: + v = d.get(key) + return str(v) if v not in (None, "") else fb + + def _bool(key: str, fb: bool) -> bool: + v = d.get(key) + if isinstance(v, bool): + return v + if v in (None, ""): + return fb + return str(v).lower() in ("1", "true", "yes", "on") + + srt_path = _path("srt") + plan_path = _path("plan") + if srt_path is None: + raise SystemExit(f"manifest row {idx}: missing srt") + if plan_path is None: + raise SystemExit(f"manifest row {idx}: missing plan") + + job_name = _str("name", plan_path.stem) + explicit_output = _path("output") + if explicit_output is None: + # Auto-isolate outputs by index so two jobs with the same name never + # silently overwrite each other. + explicit_output = ( + manifest_dir / f"final_srt_driven_{safe_ascii_name(job_name)}_{idx:02d}.mp4" + ) + + row_mode = _str("mode", getattr(defaults, "mode", "full")) + if row_mode not in ("full", "extract"): + raise SystemExit( + f"manifest row {idx}: invalid mode {row_mode!r}; " + "expected 'full' or 'extract'" + ) + + return Job( + source=_path("source"), + srt=srt_path, + plan=plan_path, + voice=_path("voice"), + bg_volume=_float("bg_volume", defaults.bg_volume), + tolerance=_float("tolerance", defaults.tolerance), + trim_direction=_str("trim_direction", defaults.trim_direction), + on_short=_str("on_short", defaults.on_short), + style=_str("style", defaults.style), + fontsdir=_path("fontsdir"), + output=explicit_output, + name=job_name, + no_cache=_bool("no_cache", defaults.no_cache), + keep_intermediates=_bool("keep_intermediates", defaults.keep_intermediates), + no_overwrite=_bool("no_overwrite", defaults.no_overwrite), + mode=row_mode, + ) + + +# ============================================================================ +# CLI +# ============================================================================ + + +def main() -> None: + ap = argparse.ArgumentParser(description="SRT-driven edit assembly") + ap.add_argument("--source", 
type=Path, default=None, + help="Form A: single source.mp4. Ignored if plan declares sources.") + ap.add_argument("--srt", type=Path, default=None, help="script.srt") + ap.add_argument("--plan", type=Path, default=None, help="edit_plan.json (Form A or B)") + ap.add_argument("--voice", type=Path, default=None, + help="Global voice.wav spanning the whole timeline. " + "Mutually exclusive with per-segment voices in the plan.") + ap.add_argument("--bg-volume", type=float, default=0.0, + help="original audio level (0.0=mute, 0.1=10%%). Default 0.0.") + ap.add_argument("--tolerance", type=float, default=0.5, + help="seconds. |source_dur - srt_dur| > tolerance triggers trim/error.") + ap.add_argument("--trim-direction", choices=["tail", "head", "center"], default="tail") + ap.add_argument("--on-short", choices=["error", "pad"], default="error") + ap.add_argument("--style", default="auto", + help=f"subtitle style. Templates: {sorted(STYLE_TEMPLATES)}. " + "'auto' picks cjk-natural if SRT has CJK, else bold-uppercase. " + "Pass a raw ASS string containing '=' to override.") + ap.add_argument("--fontsdir", type=Path, default=None, + help="extra fonts directory passed to libass.") + ap.add_argument("-o", "--output", type=Path, default=None) + ap.add_argument( + "--mode", choices=["full", "extract"], default="full", + help="'full' (default) runs extract -> concat -> subtitle burn. 
" + "'extract' stops after segment extraction and saves per-cue " + "clips to /extracted_clips_/clip_.mp4; " + "gap clips, voice mixing, subtitle burn, and QC report are " + "skipped.", + ) + ap.add_argument("--no-cache", action="store_true") + ap.add_argument("--no-overwrite", action="store_true", + help="refuse to run if output file already exists.") + ap.add_argument("--keep-intermediates", action="store_true", + help="keep the temp work dir (clips, base, concat list) after rendering.") + ap.add_argument("--batch", type=Path, default=None, + help="run a batch manifest (jobs.json or jobs.csv) instead.") + ap.add_argument("--continue-on-error", action="store_true", + help="when --batch: skip failing jobs instead of aborting.") + args = ap.parse_args() + + versions = preflight() + print(f"== preflight: ffmpeg {versions['ffmpeg']} / ffprobe {versions['ffprobe']} ==") + + if args.batch is not None: + manifest_path = args.batch.resolve() + rows = load_manifest(manifest_path) + results: list[dict] = [] + for i, row in enumerate(rows): + try: + job = job_from_dict(row, args, manifest_path.parent, i) + except (SystemExit, Exception) as e: + if args.continue_on_error: + print(f"[batch {i}] skipped: {type(e).__name__}: {e}") + results.append(make_failure_record( + index=i, name=row.get("name", f"row{i}"), + error=e, job=None, manifest_row=row, + )) + continue + raise + try: + results.append(run_job(job, versions["ffmpeg"])) + except (SystemExit, Exception) as e: + if args.continue_on_error: + print(f"[batch {i}] FAILED: {type(e).__name__}: {e}") + results.append(make_failure_record( + index=i, name=job.name, error=e, job=job, + )) + continue + raise + summary_path = manifest_path.with_name(manifest_path.stem + "_qc_summary.json") + summary_path.write_text( + json.dumps({"jobs": results, "total": len(results), + "ok": sum(1 for r in results if r.get("ok"))}, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + print(f"\nbatch QC summary → {summary_path}") + ok = sum(1 
for r in results if r.get("ok")) + print(f" {ok}/{len(results)} jobs ok") + return + + if args.srt is None or args.plan is None: + ap.error("--srt and --plan required (or use --batch)") + + job = Job( + source=args.source.resolve() if args.source else None, + srt=args.srt.resolve(), + plan=args.plan.resolve(), + voice=args.voice.resolve() if args.voice else None, + bg_volume=args.bg_volume, + tolerance=args.tolerance, + trim_direction=args.trim_direction, + on_short=args.on_short, + style=args.style, + fontsdir=args.fontsdir.resolve() if args.fontsdir else None, + output=args.output.resolve() if args.output else None, + name=args.plan.stem, + no_cache=args.no_cache, + keep_intermediates=args.keep_intermediates, + no_overwrite=args.no_overwrite, + mode=args.mode, + ) + run_job(job, versions["ffmpeg"]) + + +if __name__ == "__main__": + main() diff --git a/helpers/transcribe.py b/helpers/transcribe.py index 26d3906..a8b28ac 100644 --- a/helpers/transcribe.py +++ b/helpers/transcribe.py @@ -1,16 +1,37 @@ -"""Transcribe a video with ElevenLabs Scribe. +"""Transcribe a video with Alibaba DashScope Paraformer-v2 (realtime, file mode). + +Extracts mono 16kHz PCM audio via ffmpeg, streams it to DashScope's +paraformer-realtime-v2 model via the official `dashscope` SDK, and +writes a Scribe-compatible JSON transcript so the downstream +recommend_edit_plan helper keeps working without changes. + +Output schema (intentionally Scribe-shaped): + { + "language_code": "auto" | "", + "_source": "dashscope-paraformer-realtime-v2", + "words": [ + {"text": "你好", "start": 1.234, "end": 1.567, "type": "word"}, + ... + ] + } + +Tradeoffs vs the previous ElevenLabs Scribe integration: + - No speaker diarization — paraformer does not segment speakers, + so `speaker_id` is omitted from every word record. + - No audio events — Scribe's "(laughter)" / "(applause)" entries + with `"type": "audio_event"` are simply absent. 
+ - The `--num-speakers` flag is accepted by transcribe_one for + backward compatibility with transcribe_batch but ignored. -Extracts mono 16kHz audio via ffmpeg, uploads to Scribe with verbatim + -diarize + audio events + word-level timestamps, writes the full response -to /transcripts/.json. +Cached: if the output transcript already exists, the API call is skipped. -Cached: if the output file already exists, the upload is skipped. +API key: + DASHSCOPE_API_KEY in /.env or in the environment. Usage: python helpers/transcribe.py + python helpers/transcribe.py --language zh python helpers/transcribe.py --edit-dir /custom/edit - python helpers/transcribe.py --language en - python helpers/transcribe.py --num-speakers 2 """ from __future__ import annotations @@ -24,29 +45,34 @@ import time from pathlib import Path -import requests - -SCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text" +DASHSCOPE_MODEL = "paraformer-realtime-v2" +ENV_VAR = "DASHSCOPE_API_KEY" def load_api_key() -> str: + """Read DASHSCOPE_API_KEY from /.env, ./.env, or the environment.""" for candidate in [Path(__file__).resolve().parent.parent / ".env", Path(".env")]: if candidate.exists(): - for line in candidate.read_text().splitlines(): + for line in candidate.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line or line.startswith("#") or "=" not in line: continue k, v = line.split("=", 1) - if k.strip() == "ELEVENLABS_API_KEY": + if k.strip() == ENV_VAR: return v.strip().strip('"').strip("'") - v = os.environ.get("ELEVENLABS_API_KEY", "") + v = os.environ.get(ENV_VAR, "") if not v: - sys.exit("ELEVENLABS_API_KEY not found in .env or environment") + sys.exit( + f"{ENV_VAR} not found in .env or environment. " + f"Generate one at https://dashscope.console.aliyun.com/ " + f"and put `{ENV_VAR}=...` in /.env." 
+ ) return v def extract_audio(video_path: Path, dest: Path) -> None: + """Extract mono 16kHz PCM WAV — the format paraformer-v2 expects.""" cmd = [ "ffmpeg", "-y", "-i", str(video_path), "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", @@ -55,36 +81,109 @@ def extract_audio(video_path: Path, dest: Path) -> None: subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) -def call_scribe( +def _convert_dashscope_to_scribe( + sentences: list[dict], + language_hint: str | None, +) -> dict: + """Flatten DashScope sentence/word structure into Scribe-compatible shape. + + DashScope returns: + sentence: [ + {begin_time, end_time, text, + words: [{begin_time, end_time, text, punctuation}, ...]} + ] + + recommend_edit_plan.load_transcript_words wants a flat words[] with + seconds-based start/end and a 'word' type marker. Convert here so the + consumer stays Scribe-shaped and we don't need to touch recommender code. + + Punctuation tokens that DashScope splits onto their own word entry are + folded into the preceding word's text — closer to how Scribe formatted + them. Empty / whitespace-only text entries are dropped. 
+ """ + words: list[dict] = [] + for sent in sentences or []: + for w in (sent.get("words") or []): + text = (w.get("text") or "").strip() + if not text: + continue + punct = (w.get("punctuation") or "").strip() + try: + start_ms = float(w.get("begin_time") or 0) + end_ms = float(w.get("end_time") or 0) + except (TypeError, ValueError): + continue + words.append({ + "text": text + punct, + "start": start_ms / 1000.0, + "end": end_ms / 1000.0, + "type": "word", + }) + return { + "language_code": language_hint or "auto", + "_source": f"dashscope-{DASHSCOPE_MODEL}", + "words": words, + } + + +def call_dashscope( audio_path: Path, api_key: str, language: str | None = None, - num_speakers: int | None = None, ) -> dict: - data: dict[str, str] = { - "model_id": "scribe_v1", - "diarize": "true", - "tag_audio_events": "true", - "timestamps_granularity": "word", - } - if language: - data["language_code"] = language - if num_speakers: - data["num_speakers"] = str(num_speakers) - - with open(audio_path, "rb") as f: - resp = requests.post( - SCRIBE_URL, - headers={"xi-api-key": api_key}, - files={"file": (audio_path.name, f, "audio/wav")}, - data=data, - timeout=1800, + """Call paraformer-realtime-v2 in file mode. Returns Scribe-shaped dict. + + The dashscope SDK handles WebSocket framing internally when given a + local file path — no manual chunking required. Defensive against + minor SDK shape variations: tolerates both `output.sentence` and + `output.sentences` (the docs and the wire format have shifted). + """ + try: + import dashscope + from dashscope.audio.asr import Recognition + except ImportError: + raise SystemExit( + "dashscope package not installed. Install with:\n" + " pip install dashscope\n" + "(or `pip install -e .` from the repo root once dashscope is in " + "your project deps)." 
) - if resp.status_code != 200: - raise RuntimeError(f"Scribe returned {resp.status_code}: {resp.text[:500]}") + dashscope.api_key = api_key + # Pin to the Mainland China endpoints explicitly. Both URLs are the SDK + # defaults, but stale DASHSCOPE_HTTP_BASE_URL / DASHSCOPE_WEBSOCKET_BASE_URL + # env vars (left over from an international account) would otherwise route + # us to the wrong region and produce a misleading 401 from the intl host + # even when the key is valid on the domestic side. + dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1" + dashscope.base_websocket_api_url = ( + "wss://dashscope.aliyuncs.com/api-ws/v1/inference" + ) + + language_hints = [language] if language else None + + recognition = Recognition( + model=DASHSCOPE_MODEL, + format="wav", + sample_rate=16000, + language_hints=language_hints, + callback=None, + ) + response = recognition.call(file=str(audio_path)) + + status = getattr(response, "status_code", None) + if status != 200: + msg = getattr(response, "message", None) or str(response) + request_id = getattr(response, "request_id", "") + raise RuntimeError( + f"DashScope {DASHSCOPE_MODEL} returned status={status} " + f"request_id={request_id}: {msg}" + ) - return resp.json() + output = getattr(response, "output", None) or {} + # Both shapes seen in the wild; honour either. + sentences = output.get("sentence") or output.get("sentences") or [] + return _convert_dashscope_to_scribe(sentences, language) def transcribe_one( @@ -97,8 +196,19 @@ def transcribe_one( ) -> Path: """Transcribe a single video. Returns path to transcript JSON. + `num_speakers` is accepted for backward compatibility with the previous + ElevenLabs Scribe interface (and with transcribe_batch.py's call site) + but is ignored — paraformer does not perform speaker diarization. A + one-line note is printed when a non-None value is supplied in verbose mode. + Cached: returns existing path immediately if the transcript already exists. 
""" + if num_speakers is not None and verbose: + print( + f" (note: --num-speakers={num_speakers} ignored — DashScope " + f"{DASHSCOPE_MODEL} has no speaker diarization)" + ) + transcripts_dir = edit_dir / "transcripts" transcripts_dir.mkdir(parents=True, exist_ok=True) out_path = transcripts_dir / f"{video.stem}.json" @@ -117,23 +227,33 @@ def transcribe_one( extract_audio(video, audio) size_mb = audio.stat().st_size / (1024 * 1024) if verbose: - print(f" uploading {video.stem}.wav ({size_mb:.1f} MB)", flush=True) - payload = call_scribe(audio, api_key, language, num_speakers) - - out_path.write_text(json.dumps(payload, indent=2)) + print( + f" streaming {video.stem}.wav ({size_mb:.1f} MB) " + f"to DashScope {DASHSCOPE_MODEL}", + flush=True, + ) + payload = call_dashscope(audio, api_key, language) + + # ensure_ascii=False so CJK characters are stored as-is (smaller file + + # human-readable when inspecting transcripts). + out_path.write_text( + json.dumps(payload, ensure_ascii=False, indent=2), + encoding="utf-8", + ) dt = time.time() - t0 if verbose: kb = out_path.stat().st_size / 1024 - print(f" saved: {out_path.name} ({kb:.1f} KB) in {dt:.1f}s") - if isinstance(payload, dict) and "words" in payload: - print(f" words: {len(payload['words'])}") + words_count = len(payload.get("words", [])) + print(f" saved: {out_path.name} ({kb:.1f} KB, {words_count} words) in {dt:.1f}s") return out_path def main() -> None: - ap = argparse.ArgumentParser(description="Transcribe a video with ElevenLabs Scribe") + ap = argparse.ArgumentParser( + description=f"Transcribe a video with DashScope {DASHSCOPE_MODEL}" + ) ap.add_argument("video", type=Path, help="Path to video file") ap.add_argument( "--edit-dir", @@ -145,13 +265,7 @@ def main() -> None: "--language", type=str, default=None, - help="Optional ISO language code (e.g., 'en'). Omit to auto-detect.", - ) - ap.add_argument( - "--num-speakers", - type=int, - default=None, - help="Optional number of speakers when known. 
Improves diarization accuracy.", + help="Language hint (e.g. 'zh', 'en', 'ja'). Omit to auto-detect.", ) args = ap.parse_args() @@ -167,7 +281,6 @@ def main() -> None: edit_dir=edit_dir, api_key=api_key, language=args.language, - num_speakers=args.num_speakers, ) diff --git a/helpers/transcribe_batch.py b/helpers/transcribe_batch.py index 5aeb1d6..3fe86e0 100644 --- a/helpers/transcribe_batch.py +++ b/helpers/transcribe_batch.py @@ -1,14 +1,15 @@ """Batch-transcribe every video in a directory with 4 parallel workers. -Walks for common video extensions, runs ElevenLabs Scribe on -each, writes transcripts to /edit/transcripts/.json. +Walks for common video extensions, transcribes each via +DashScope paraformer-v2 (see helpers/transcribe.py), writes transcripts +to /edit/transcripts/.json. Cached per-file: any source that already has a transcript is skipped. Usage: python helpers/transcribe_batch.py python helpers/transcribe_batch.py --workers 4 - python helpers/transcribe_batch.py --num-speakers 2 + python helpers/transcribe_batch.py --language zh python helpers/transcribe_batch.py --edit-dir /custom/edit """ @@ -48,13 +49,7 @@ def main() -> None: "--language", type=str, default=None, - help="Optional ISO language code. Omit to auto-detect per file.", - ) - ap.add_argument( - "--num-speakers", - type=int, - default=None, - help="Optional number of speakers. Improves diarization when known.", + help="Language hint (e.g. 'zh', 'en'). Omit to auto-detect per file.", ) args = ap.parse_args() @@ -91,7 +86,6 @@ def main() -> None: edit_dir=edit_dir, api_key=api_key, language=args.language, - num_speakers=args.num_speakers, verbose=False, ): v for v in pending diff --git a/main.py b/main.py new file mode 100644 index 0000000..3e43b97 --- /dev/null +++ b/main.py @@ -0,0 +1,86 @@ +"""Project-root entry point for the SRT-driven editor. 
+ +A thin wrapper over `helpers/srt_driven_edit.py` that fills in the +`input/` -> `output/` layout described in CLAUDE.md so the common case +collapses to a single command: + + python main.py + +The wrapper injects these defaults only when the corresponding flag is +absent from `sys.argv`: + + --srt input/script.srt (always; required by srt_driven_edit) + --plan input/edit_plan.json (always; required by srt_driven_edit) + --source input/source.mp4 (only if the file exists) + --voice input/voice.wav (only if the file exists) + -o output/final.mp4 (always; output/ is auto-created) + +Anything you pass explicitly wins. Batch mode (`--batch `) skips +all single-job defaults so the manifest fully owns its own paths. + +Examples: + python main.py + python main.py --bg-volume 0.1 --style cjk-natural + python main.py --plan plans/custom.json -o out/custom.mp4 + python main.py --batch jobs.json --continue-on-error +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Wire helpers/ onto sys.path so `from srt_driven_edit import ...` works +# regardless of the user's cwd. +ROOT = Path(__file__).resolve().parent +sys.path.insert(0, str(ROOT / "helpers")) + +from srt_driven_edit import main as _srt_driven_main # noqa: E402 + + +def _has_flag(args: list[str], *flags: str) -> bool: + """True if any of `flags` appears in `args`, in either bare or `=` form.""" + for token in args: + for f in flags: + if token == f or token.startswith(f + "="): + return True + return False + + +def _inject_defaults(args: list[str]) -> list[str]: + """Add input/ -> output/ defaults for the flags the user did not provide.""" + out = list(args) + + # Batch mode owns its own paths via the manifest — never inject. 
+ if _has_flag(out, "--batch"): + return out + + if not _has_flag(out, "--srt"): + out += ["--srt", "input/script.srt"] + if not _has_flag(out, "--plan"): + out += ["--plan", "input/edit_plan.json"] + + # --source is required for Form A plans but ignored for Form B. Inject + # only when the file is actually present so Form B users with no + # input/source.mp4 don't get a misleading "missing on disk" error. + if not _has_flag(out, "--source") and (ROOT / "input/source.mp4").exists(): + out += ["--source", "input/source.mp4"] + + # Same idea for voice: it's always optional, so inject only when present. + if not _has_flag(out, "--voice") and (ROOT / "input/voice.wav").exists(): + out += ["--voice", "input/voice.wav"] + + if not _has_flag(out, "-o", "--output"): + (ROOT / "output").mkdir(exist_ok=True) + out += ["-o", "output/final.mp4"] + + return out + + +def main() -> None: + sys.argv = [sys.argv[0]] + _inject_defaults(sys.argv[1:]) + _srt_driven_main() + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index c2cff29..651d0fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ license = { file = "LICENSE" } requires-python = ">=3.10" dependencies = [ "requests", + "dashscope>=1.20", "librosa", "matplotlib", "pillow", @@ -14,6 +15,7 @@ dependencies = [ [project.optional-dependencies] animations = ["manim"] +dev = ["pytest>=7"] [build-system] requires = ["setuptools>=61.0"] diff --git a/srt_video_editor.py b/srt_video_editor.py new file mode 100644 index 0000000..2477e4b --- /dev/null +++ b/srt_video_editor.py @@ -0,0 +1,478 @@ +"""srt_video_editor — minimal viable, learning-grade scaffold. + +================================================================ +THIS IS NOT THE PRODUCTION ENTRY POINT. Use `python main.py` (or +`python helpers/srt_driven_edit.py` directly) for any real work. 
+================================================================ + +What this script does: + - reads script.srt + edit_plan.json + - validates ids match + - prints the planned mapping + - cuts each cue out of source.mp4 to temp/clip_<id>.mp4 + - lossless-concats into output/final.mp4 + +What it deliberately DOES NOT do (use main.py / srt_driven_edit.py +for any of these): + - encoding fallback — only UTF-8 / UTF-8-with-BOM SRT is accepted; + GB18030 / cp936 input will crash + - source range bounds check — a plan that overruns the source's + duration will surface as a confusing ffmpeg error, not a clear + "id=X exceeds source duration" up front + - QC report — no per-clip drift, no disk-usage accounting, no + structured failure record + - overwrite protection — every run silently `-y` overwrites the + temp/ clips and output/final.mp4 + - audio fades at cut points (you may hear pops on hard cuts) + - voice replacement, subtitle burn, color grade, HDR tone-map, + sync tails, segment cache, batch / per-episode discovery + +Self-contained on purpose: no imports from helpers/, so the entire +flow fits in one readable file. Use this to learn the pipeline; ship +with main.py.
+ +Usage: + python srt_video_editor.py + python srt_video_editor.py --srt input/script.srt --plan input/edit_plan.json \\ + --source input/source.mp4 \\ + --temp-dir temp/ --output output/final.mp4 +""" + +from __future__ import annotations + +import argparse +import json +import re +import subprocess +import sys +from pathlib import Path + + +# ---------- timestamp helpers ---------- + +_TS_RE = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})") + + +def parse_ts(s: str) -> float: + """Parse 'HH:MM:SS,ms' or 'HH:MM:SS.ms' to seconds.""" + m = _TS_RE.fullmatch(s.strip()) + if not m: + raise ValueError(f"bad timestamp: {s!r}") + h, mn, sec, ms = m.groups() + return int(h) * 3600 + int(mn) * 60 + int(sec) + int(ms.ljust(3, "0")) / 1000.0 + + +def format_ts(seconds: float) -> str: + """SRT-style HH:MM:SS,ms — comma separator (for log output / errors).""" + total_ms = int(round(seconds * 1000)) + h, rem = divmod(total_ms, 3600_000) + m, rem = divmod(rem, 60_000) + s, ms = divmod(rem, 1000) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + +def format_ts_dot(seconds: float) -> str: + """ffmpeg-style HH:MM:SS.ms — dot separator. Used for `-ss` / `-t` args. + + SRT timestamps use a comma between seconds and milliseconds; ffmpeg + expects a dot. The two forms refer to the same point in time but the + comma form is rejected by ffmpeg's parser. + """ + return format_ts(seconds).replace(",", ".") + + +# ---------- parsers ---------- + + +def parse_srt(path: Path) -> list[dict]: + """Return a list of {id, start, end, text} in file order. + + Tolerates UTF-8 with or without BOM, CRLF / LF line endings, and + SRT cue settings ('position:90% align:start') trailing the time line. + + Per-cue duration is validated here: `end <= start` is rejected with + an id-pinned error so the downstream ffmpeg call never sees a + non-positive `-t` argument. 
+ """ + raw = path.read_text(encoding="utf-8-sig") + cues: list[dict] = [] + for block in re.split(r"\r?\n\r?\n+", raw.strip()): + lines = [ln for ln in block.splitlines() if ln.strip()] + if len(lines) < 2: + continue + try: + cid = int(lines[0].strip()) + except ValueError: + raise SystemExit(f"SRT id line is not an integer: {lines[0]!r}") + if "-->" not in lines[1]: + raise SystemExit(f"SRT block missing '-->' time line: {lines[1]!r}") + left, right = lines[1].split("-->", 1) + start = parse_ts(left.strip().split()[-1]) + end = parse_ts(right.strip().split()[0]) + if end <= start: + raise SystemExit( + f"SRT id={cid}: end {format_ts(end)} <= start " + f"{format_ts(start)} (srt_duration {end - start:.3f}s). " + f"Fix the timestamp in {path}." + ) + text = "\n".join(lines[2:]) + cues.append({"id": cid, "start": start, "end": end, "text": text}) + if not cues: + raise SystemExit(f"SRT has no cues: {path}") + return cues + + +def parse_plan(path: Path) -> list[dict]: + """Return a list of {id, source_start, source_end}. Only Form A is + accepted here (a flat JSON array); Form B is out of scope for the + minimal version. + + JSON syntax errors are reported as a SystemExit with the file path + plus the offending line / column / message, rather than as a bare + JSONDecodeError traceback. + """ + raw = path.read_text(encoding="utf-8") + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise SystemExit( + f"edit_plan is not valid JSON: {path}: " + f"line {e.lineno} col {e.colno}: {e.msg}" + ) + if not isinstance(data, list): + raise SystemExit( + "edit_plan.json must be a JSON array of " + "{id, source_start, source_end} objects (Form A)." 
+ ) + out: list[dict] = [] + for row in data: + try: + out.append({ + "id": int(row["id"]), + "source_start": parse_ts(row["source_start"]), + "source_end": parse_ts(row["source_end"]), + }) + except (KeyError, ValueError) as e: + raise SystemExit(f"plan row {row!r}: {e}") + return out + + +# ---------- validation ---------- + + +def validate_ids(cues: list[dict], plan: list[dict]) -> None: + """Each id must appear exactly once in both sides, and the two id sets + must be equal. Any deviation is a hard failure with a clear message. + """ + cue_ids = [c["id"] for c in cues] + plan_ids = [p["id"] for p in plan] + + dup_cue = {i for i in cue_ids if cue_ids.count(i) > 1} + if dup_cue: + raise SystemExit(f"SRT has duplicate ids: {sorted(dup_cue)}") + dup_plan = {i for i in plan_ids if plan_ids.count(i) > 1} + if dup_plan: + raise SystemExit(f"edit_plan has duplicate ids: {sorted(dup_plan)}") + + only_srt = set(cue_ids) - set(plan_ids) + only_plan = set(plan_ids) - set(cue_ids) + if only_srt or only_plan: + msg = [] + if only_srt: + msg.append(f"in SRT but missing in plan: {sorted(only_srt)}") + if only_plan: + msg.append(f"in plan but missing in SRT: {sorted(only_plan)}") + raise SystemExit("id mismatch: " + "; ".join(msg)) + + +# ---------- report ---------- + + +def probe_clip_duration(path: Path) -> float | None: + """Return the duration of `path` in seconds via ffprobe. + + Returns None if ffprobe is missing or the file is unreadable — + verification is informational, so probe failures should not abort + the run after a successful extraction. 
+ """ + cmd = [ + "ffprobe", "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + str(path), + ] + try: + proc = subprocess.run( + cmd, capture_output=True, text=True, + encoding="utf-8", errors="replace", + ) + except FileNotFoundError: + return None + if proc.returncode != 0: + return None + try: + return float(proc.stdout.strip()) + except ValueError: + return None + + +def cut_clip(source: Path, start: float, cut_duration: float, + out_path: Path) -> None: + """Cut `cut_duration` seconds starting at `start` from source. + + `-ss` placed before `-i` makes ffmpeg do a fast container-level seek + to the nearest keyframe, then libx264 re-encodes from there — + frame-accurate at the cost of one encode pass. Stream copy (`-c copy`) + would be faster but cuts at keyframes only, which makes downstream + concat / sync less predictable; we trade a few seconds of encode + time per clip for cleaner cut boundaries. + + Audio is mapped optionally via `-map 0:a?` so a video-only source + does not crash the run. Video is the first stream (`-map 0:v:0`). + + Raises SystemExit with the full ffmpeg command + stderr on failure + so the caller never has to scroll the terminal to find what went wrong. + """ + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-ss", format_ts_dot(start), + "-i", str(source), + "-t", format_ts_dot(cut_duration), + "-map", "0:v:0", + "-map", "0:a?", + "-c:v", "libx264", "-preset", "veryfast", "-crf", "18", + "-pix_fmt", "yuv420p", + "-c:a", "aac", "-b:a", "192k", + "-ar", "48000", "-ac", "2", + "-movflags", "+faststart", + str(out_path), + ] + try: + proc = subprocess.run( + cmd, capture_output=True, text=True, + encoding="utf-8", errors="replace", + ) + except FileNotFoundError: + raise SystemExit( + "ffmpeg not found on PATH. Install ffmpeg " + "(`winget install Gyan.FFmpeg` on Windows, " + "`brew install ffmpeg` on macOS) and re-run." 
+ ) + if proc.returncode != 0: + raise SystemExit( + f"ffmpeg failed on {out_path.name} (exit {proc.returncode})\n" + f"--- command ---\n{' '.join(cmd)}\n" + f"--- stderr ---\n{proc.stderr or '(empty)'}" + ) + + +def extract_clips( + cues: list[dict], + plan: list[dict], + source: Path, + temp_dir: Path, +) -> list[Path]: + """Cut one clip per cue. Returns the list of output paths in cue-id order. + + Per-cue duration logic: + source_duration = plan.source_end - plan.source_start + srt_duration = cue.end - cue.start + + source_duration <= 0 -> hard error pointing at the id + source_duration < srt_duration -> hard error (source is too short + to cover the SRT cue; either + extend the source range or + shorten the cue) + source_duration >= srt_duration -> cut exactly `srt_duration` + starting at source_start. Any + extra source tail is discarded. + + Stale `clip_*.mp4` files in `temp_dir` are removed before cutting so + a previous failed run with sparser ids doesn't leave misleading + leftovers next to the new clips. The `_concat.txt` from a future + concat step is NOT touched here — concat owns its own list file. + + Filenames are `clip_.mp4`, indexed by SRT id (not position). + """ + plan_by_id = {p["id"]: p for p in plan} + temp_dir.mkdir(parents=True, exist_ok=True) + + # Pre-clean stale clip files. Only the clip_*.mp4 pattern so user- + # created neighbours (notes, recordings, etc.) are left alone. 
+ stale = sorted(temp_dir.glob("clip_*.mp4")) + if stale: + print(f"clearing {len(stale)} stale clip(s) from {temp_dir}/") + for p in stale: + p.unlink() + + print() + print(f"cutting {len(cues)} clip(s) -> {temp_dir}/") + outputs: list[Path] = [] + targets: list[float] = [] # parallel to outputs — used by post-verify pass + for cue in sorted(cues, key=lambda c: c["id"]): + cid = cue["id"] + p = plan_by_id[cid] + start = p["source_start"] + source_duration = p["source_end"] - start + srt_duration = cue["end"] - cue["start"] + + if source_duration <= 0: + raise SystemExit( + f"plan id={cid}: source_end {format_ts(p['source_end'])} <= " + f"source_start {format_ts(start)} " + f"(source_duration {source_duration:.3f}s)" + ) + if source_duration < srt_duration - 1e-6: + raise SystemExit( + f"plan id={cid}: source range is shorter than SRT cue. " + f"source_duration={source_duration:.3f}s, " + f"srt_duration={srt_duration:.3f}s. " + f"Extend the source range or shorten the SRT cue." + ) + # source_duration >= srt_duration: cut exactly srt_duration + cut_duration = srt_duration + + out_path = temp_dir / f"clip_{cid:03d}.mp4" + text_preview = cue["text"].replace("\n", " ").strip() + if len(text_preview) > 60: + text_preview = text_preview[:57] + "..." + print( + f" id={cid:>3} src@{format_ts(start)} " + f"cut={cut_duration:.3f}s -> {out_path}\n" + f" text: {text_preview!r}" + ) + cut_clip(source, start, cut_duration, out_path) + outputs.append(out_path) + targets.append(cut_duration) + + # ---- ffprobe verification ---- + # Container duration can drift a few hundredths of a second from the + # target after re-encoding (libx264 GOP / first-keyframe boundary). + # Print the actual vs target side by side so the user can spot a + # clip that's wildly off — e.g. ffmpeg silently truncated to 0s. 
def _concat_list_entry(clip: Path) -> str:
    """Return one ``file '<path>'`` directive line for ffmpeg's concat list.

    The path is made absolute and rendered with forward slashes. A literal
    single quote cannot appear inside a single-quoted ffmpeg string, so each
    one is rewritten as: close quote, backslash-escaped quote, reopen quote.
    """
    escaped = clip.resolve().as_posix().replace("'", "'\\''")
    return f"file '{escaped}'\n"


def concat_clips(clip_paths: list[Path], out_path: Path) -> None:
    """Lossless concat of pre-encoded clips via ffmpeg's concat demuxer.

    A list file (`_concat.txt`) is written next to the first clip, ffmpeg is
    run with `-f concat -safe 0 -c copy`, and the list file is removed in
    `finally` whether the run succeeds or not. The parent directory of
    `out_path` is created if missing.

    `-c copy` relies on every clip sharing the same encoder parameters; the
    original note says `cut_clip` guarantees that (libx264, yuv420p, aac),
    but `cut_clip` is defined elsewhere, so that is taken on trust here.

    Args:
        clip_paths: Clips to join, in output order.
        out_path: Destination file; overwritten (`-y`).

    Raises:
        SystemExit: if `clip_paths` is empty, if ffmpeg is not on PATH, or
            if ffmpeg exits non-zero (message carries command + stderr).
    """
    if not clip_paths:
        raise SystemExit("concat: no clips to concatenate")

    list_file = clip_paths[0].parent / "_concat.txt"
    cmd = [
        "ffmpeg", "-y", "-hide_banner", "-nostats",
        "-f", "concat", "-safe", "0",
        "-i", str(list_file),
        "-c", "copy",
        "-movflags", "+faststart",
        str(out_path),
    ]
    try:
        # Everything that can fail after the list file exists lives inside
        # the try, so the `finally` below never leaves a stale list behind.
        list_file.write_text(
            "".join(_concat_list_entry(p) for p in clip_paths),
            encoding="utf-8",
        )
        out_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            proc = subprocess.run(
                cmd, capture_output=True, text=True,
                encoding="utf-8", errors="replace",
            )
        except FileNotFoundError:
            raise SystemExit(
                "ffmpeg not found on PATH. Install ffmpeg "
                "(`winget install Gyan.FFmpeg` on Windows, "
                "`brew install ffmpeg` on macOS) and re-run."
            ) from None
        if proc.returncode != 0:
            raise SystemExit(
                f"ffmpeg concat failed (exit {proc.returncode})\n"
                f"--- command ---\n{' '.join(cmd)}\n"
                f"--- stderr ---\n{proc.stderr or '(empty)'}"
            )
    finally:
        list_file.unlink(missing_ok=True)
    print(f" concat {len(clip_paths)} clip(s) -> {out_path}")


def print_report(cues: list[dict], plan: list[dict]) -> None:
    """Print one table row per cue: output range, planned source range, text.

    Cues are listed in ascending `id` order. Each cue's plan entry is looked
    up by id, so a cue without a plan entry raises KeyError — `main` runs
    `validate_ids` before calling this. Text is flattened to one line and
    truncated to 50 characters (47 + "...").
    """
    plan_by_id = {p["id"]: p for p in plan}
    print(f"{len(cues)} cue(s), all ids matched.")
    print()
    header = f" {'ID':>3} {'OUTPUT (cue)':<23} {'SOURCE (planned)':<23} TEXT"
    print(header)
    print(f" {'-' * 3} {'-' * 23} {'-' * 23} {'-' * 4}")
    for cue in sorted(cues, key=lambda c: c["id"]):
        p = plan_by_id[cue["id"]]
        out_range = f"{format_ts(cue['start'])} -> {format_ts(cue['end'])}"
        src_range = f"{format_ts(p['source_start'])} -> {format_ts(p['source_end'])}"
        preview = cue["text"].replace("\n", " ")
        if len(preview) > 50:
            preview = preview[:47] + "..."
        print(f" {cue['id']:>3} {out_range:<23} {src_range:<23} {preview}")


# ---------- entry ----------


def main() -> None:
    """CLI entry: validate inputs, print the plan table, cut clips, concat.

    Reads `--srt`, `--plan`, `--source`, `--temp-dir` and `--output`
    (defaults under input/, temp/ and output/). Exits via SystemExit if any
    of the three input files is missing; later failures surface from the
    helpers it calls.
    """
    ap = argparse.ArgumentParser(
        description=(
            "MINIMAL learning-grade SRT-driven editor. NOT for production — "
            "use `python main.py` for that. This script reads script.srt + "
            "edit_plan.json, validates id matching, prints the planned range "
            "table, cuts each cue out of source.mp4 into temp/clip_NNN.mp4, "
            "then lossless-concats them into output/final.mp4. No encoding "
            "fallback, no range-bounds check, no QC report, no overwrite "
            "protection."
        ),
    )
    ap.add_argument("--srt", type=Path, default=Path("input/script.srt"))
    ap.add_argument("--plan", type=Path, default=Path("input/edit_plan.json"))
    ap.add_argument("--source", type=Path, default=Path("input/source.mp4"))
    ap.add_argument("--temp-dir", type=Path, default=Path("temp"))
    ap.add_argument("--output", type=Path, default=Path("output/final.mp4"))
    args = ap.parse_args()

    print(
        "[srt_video_editor: minimal mode — UTF-8 SRT only, no range/QC "
        "checks, temp/ + output/ will be overwritten. For production "
        "use `python main.py`.]"
    )

    # Fail early with a plain message rather than deep inside a parser.
    for p in (args.srt, args.plan, args.source):
        if not p.is_file():
            raise SystemExit(f"file not found: {p}")

    cues = parse_srt(args.srt)
    plan = parse_plan(args.plan)
    validate_ids(cues, plan)
    print_report(cues, plan)
    clip_paths = extract_clips(cues, plan, args.source, args.temp_dir)
    print()
    print(f"concatenating -> {args.output}")
    concat_clips(clip_paths, args.output)
    print()
    print(f"done. final video: {args.output}")


if __name__ == "__main__":
    main()
HELPERS = Path(__file__).resolve().parent.parent / "helpers"
sys.path.insert(0, str(HELPERS))


FFMPEG = shutil.which("ffmpeg")
FFPROBE = shutil.which("ffprobe")


def pytest_collection_modifyitems(config, items):
    """Auto-skip the tests in this dir if ffmpeg/ffprobe are missing.

    pytest passes this hook every item collected in the session, not only
    the ones below this conftest.py, so the skip marker is applied only to
    items whose file lives under this directory.
    """
    if FFMPEG and FFPROBE:
        return
    here = Path(__file__).resolve().parent
    marker = pytest.mark.skip(reason="ffmpeg or ffprobe not on PATH")
    for item in items:
        # `path` is the pathlib attribute on newer pytest, `fspath` the
        # legacy one; either stringifies to the test file's location.
        location = getattr(item, "path", None) or item.fspath
        if here in Path(str(location)).resolve().parents:
            item.add_marker(marker)


# ---------------------------------------------------------------------------
# Synthetic media (session-scoped — each costs a few seconds to render)
# ---------------------------------------------------------------------------


def _ffmpeg(*args: str) -> None:
    """Run `ffmpeg -y -hide_banner -loglevel error <args>`.

    Raises:
        RuntimeError: on a non-zero exit, with the command and stderr.
    """
    cmd = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "error", *args]
    r = subprocess.run(cmd, capture_output=True, text=True,
                       encoding="utf-8", errors="replace")
    if r.returncode != 0:
        raise RuntimeError(f"ffmpeg failed:\n cmd: {' '.join(cmd)}\n stderr: {r.stderr}")


@pytest.fixture(scope="session")
def synth_av(tmp_path_factory) -> Path:
    """30s 1080p@24 testsrc2 + 440Hz sine. Spans long enough for sub-second cuts."""
    d = tmp_path_factory.mktemp("synth")
    out = d / "av.mp4"
    _ffmpeg(
        "-f", "lavfi", "-i", "testsrc2=size=1920x1080:rate=24:duration=30",
        "-f", "lavfi", "-i", "sine=frequency=440:duration=30",
        "-c:v", "libx264", "-preset", "ultrafast", "-pix_fmt", "yuv420p",
        "-c:a", "aac", "-b:a", "128k", "-ar", "48000",
        "-shortest",
        str(out),
    )
    return out


@pytest.fixture(scope="session")
def synth_v_only(tmp_path_factory) -> Path:
    """30s 1080p video without an audio track. Exercises the auto-degrade path."""
    d = tmp_path_factory.mktemp("synth_vonly")
    out = d / "v_only.mp4"
    _ffmpeg(
        "-f", "lavfi", "-i", "testsrc2=size=1920x1080:rate=24:duration=30",
        "-an",
        "-c:v", "libx264", "-preset", "ultrafast", "-pix_fmt", "yuv420p",
        "-t", "30",
        str(out),
    )
    return out


@pytest.fixture(scope="session")
def synth_voice(tmp_path_factory) -> Path:
    """5s 880Hz sine — drop-in per-segment voice clip."""
    d = tmp_path_factory.mktemp("synth_voice")
    out = d / "voice.wav"
    _ffmpeg(
        "-f", "lavfi", "-i", "sine=frequency=880:duration=5",
        "-ar", "48000", "-ac", "2",
        str(out),
    )
    return out


# ---------------------------------------------------------------------------
# Helpers for crafting SRT / plan files inside a test's tmp_path
# ---------------------------------------------------------------------------


def fmt_ts(s: float) -> str:
    """Format seconds as an SRT timestamp `HH:MM:SS,mmm` (rounded to the ms)."""
    total_ms = int(round(s * 1000))
    h, rem = divmod(total_ms, 3600_000)
    m, rem = divmod(rem, 60_000)
    sec, ms = divmod(rem, 1000)
    return f"{h:02d}:{m:02d}:{sec:02d},{ms:03d}"


def write_srt(path: Path, cues: list[tuple[int, float, float, str]],
              encoding: str = "utf-8") -> None:
    """Write an SRT. cues: [(id, start_s, end_s, text)]."""
    lines: list[str] = []
    for cid, s, e, t in cues:
        lines.append(str(cid))
        lines.append(f"{fmt_ts(s)} --> {fmt_ts(e)}")
        lines.append(t)
        lines.append("")
    # Bytes, not text mode: keeps "\n" line endings on every platform.
    path.write_bytes("\n".join(lines).encode(encoding))


def write_plan_form_a(path: Path,
                      segments: list[tuple[int, float, float]]) -> None:
    """Legacy array form. segments: [(id, src_start_s, src_end_s)]."""
    data = [
        {"id": cid, "source_start": fmt_ts(s), "source_end": fmt_ts(e)}
        for cid, s, e in segments
    ]
    path.write_text(json.dumps(data, indent=2), encoding="utf-8")


def write_plan_form_b(path: Path, sources: dict[str, str],
                      segments: list[dict],
                      voices: dict[str, str] | None = None) -> None:
    """Object form with multi-source / multi-voice support.

    The "voices" key is written only when `voices` is non-empty.
    """
    data: dict = {"sources": sources, "segments": segments}
    if voices:
        data["voices"] = voices
    path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")


@pytest.fixture
def helpers_ns():
    """Convenience: bundle the helpers module + write_* functions in one object."""
    import srt_driven_edit as sde

    class NS:
        pass

    ns = NS()
    ns.sde = sde
    ns.write_srt = write_srt
    ns.write_plan_form_a = write_plan_form_a
    ns.write_plan_form_b = write_plan_form_b
    ns.fmt_ts = fmt_ts
    return ns


# ---- tests/test_main_entry.py ----

ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))


@pytest.fixture
def main_mod(monkeypatch, tmp_path):
    """Fresh import of main.py rooted at a tmp dir so ROOT/input doesn't leak."""
    monkeypatch.chdir(tmp_path)
    # Force-reload main with a new ROOT pointing at tmp_path so file-existence
    # checks reflect what the test wrote, not what's actually in the repo root.
    import importlib
    import main as _m
    importlib.reload(_m)
    # Rebind so input/source.mp4 etc. resolve in tmp. Going through
    # monkeypatch restores the attribute at teardown instead of leaving the
    # tmp path on the shared module object.
    monkeypatch.setattr(_m, "ROOT", tmp_path, raising=False)
    return _m


def test_defaults_when_no_flags(main_mod, tmp_path):
    """No flags + nothing in input/ → srt/plan/output defaults, no source/voice."""
    out = main_mod._inject_defaults([])
    assert "--srt" in out and "input/script.srt" in out
    assert "--plan" in out and "input/edit_plan.json" in out
    assert "-o" in out and "output/final.mp4" in out
    # input/source.mp4 doesn't exist → --source NOT injected
    assert "--source" not in out
    assert "--voice" not in out
    # output/ dir was created
    assert (tmp_path / "output").is_dir()


def test_injects_source_when_present(main_mod, tmp_path):
    """An existing input/source.mp4 is injected as the --source default."""
    (tmp_path / "input").mkdir()
    (tmp_path / "input" / "source.mp4").write_bytes(b"x")
    out = main_mod._inject_defaults([])
    assert "--source" in out and "input/source.mp4" in out


def test_injects_voice_when_present(main_mod, tmp_path):
    """An existing input/voice.wav is injected as the --voice default."""
    (tmp_path / "input").mkdir()
    (tmp_path / "input" / "voice.wav").write_bytes(b"x")
    out = main_mod._inject_defaults([])
    assert "--voice" in out and "input/voice.wav" in out


def test_user_flags_win(main_mod, tmp_path):
    """Flags supplied by the user are kept and suppress the matching defaults."""
    (tmp_path / "input").mkdir()
    (tmp_path / "input" / "source.mp4").write_bytes(b"x")
    user = ["--srt", "scripts/ep01.srt",
            "--plan", "plans/ep01.json",
            "--source", "raw/ep01.mp4",
            "-o", "out/ep01.mp4"]
    out = main_mod._inject_defaults(user)
    # User-supplied wins; no duplicate defaults appended
    assert out.count("--srt") == 1 and "scripts/ep01.srt" in out
    assert out.count("--plan") == 1 and "plans/ep01.json" in out
    assert out.count("--source") == 1 and "raw/ep01.mp4" in out
    assert out.count("-o") == 1 and "out/ep01.mp4" in out
    # Default input/script.srt etc. NOT injected
    assert "input/script.srt" not in out
    assert "input/edit_plan.json" not in out


def test_equals_form_recognized(main_mod, tmp_path):
    """--flag=value form must count as 'flag is set' so we don't double-inject."""
    out = main_mod._inject_defaults(["--srt=scripts/x.srt", "--plan=plans/x.json"])
    # Defaults must NOT be appended. Both the user's tokens and any default
    # bare `--srt` / `--plan` would otherwise coexist.
    assert "--srt=scripts/x.srt" in out
    assert "--plan=plans/x.json" in out
    assert "--srt" not in out  # no bare default flag
    assert "--plan" not in out
    assert "input/script.srt" not in out
    assert "input/edit_plan.json" not in out


def test_batch_mode_skips_all_defaults(main_mod, tmp_path):
    """With --batch no single-job default is injected, even if inputs exist."""
    (tmp_path / "input").mkdir()
    (tmp_path / "input" / "source.mp4").write_bytes(b"x")
    out = main_mod._inject_defaults(["--batch", "jobs.json"])
    # No single-job defaults — manifest owns paths.
    assert "--srt" not in out
    assert "--plan" not in out
    assert "--source" not in out
    assert "-o" not in out
    assert "--output" not in out


def test_short_output_flag_recognized(main_mod, tmp_path):
    """A user `-o` suppresses the default output path."""
    out = main_mod._inject_defaults(["-o", "custom/path.mp4"])
    assert out.count("-o") == 1
    assert "output/final.mp4" not in out


# ---- tests/test_recommend_edit_plan.py ----


@pytest.fixture
def rec():
    """Convenience: import the module under test as a fixture."""
    import recommend_edit_plan as r
    return r


@pytest.fixture
def sde():
    """Convenience: import srt_driven_edit as a fixture."""
    import srt_driven_edit as s
    return s


def write_transcript(path: Path, words: list[dict]) -> None:
    """Wrap a flat list of {text,start,end,type} dicts in a Scribe-style envelope."""
    path.write_text(
        json.dumps({"language_code": "en", "words": words}, ensure_ascii=False),
        encoding="utf-8",
    )


def write_srt_cues(path, cues, helpers_ns):
    """Write `cues` as a UTF-8 SRT via the shared `helpers_ns.write_srt`."""
    helpers_ns.write_srt(path, cues, encoding="utf-8")


# ---------------------------------------------------------------------------
# 1. English exact match — high score, correct range
# ---------------------------------------------------------------------------


def test_english_exact_match(rec, helpers_ns, tmp_path):
    """An exact-text cue matches the right span with a high score and no warnings."""
    srt = tmp_path / "script.srt"
    transcript = tmp_path / "transcript.json"
    # Cue duration matches candidate duration exactly so duration warnings stay quiet.
    helpers_ns.write_srt(srt, [
        (1, 0.0, 1.0, "Hello world"),
    ])
    write_transcript(transcript, [
        {"text": "Hello", "start": 5.0, "end": 5.4, "type": "word"},
        {"text": "world.", "start": 5.4, "end": 6.0, "type": "word"},
        {"text": "Other", "start": 10.0, "end": 10.5, "type": "word"},
    ])

    assignments = rec.recommend(
        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
        output=tmp_path / "plan.json",
    )
    assert len(assignments) == 1
    a = assignments[0]
    assert a.cand is not None
    assert abs(a.cand.start - 5.0) < 1e-6
    assert abs(a.cand.end - 6.0) < 1e-6
    assert a.score > 0.85, f"exact-text match should score high, got {a.score}"
    assert not a.warnings, f"unexpected warnings: {a.warnings}"
# ---------------------------------------------------------------------------
# 2. Chinese match — CJK Jaccard path
# ---------------------------------------------------------------------------


def test_chinese_match(rec, helpers_ns, tmp_path):
    """A Chinese cue is matched to the 12.0s–15.0s span, not the distractor."""
    srt_path = tmp_path / "script.srt"
    transcript_path = tmp_path / "transcript.json"
    spoken = [
        ("我们", 12.0, 12.4),
        ("这", 12.4, 12.5),
        ("季度", 12.5, 13.0),
        ("把", 13.0, 13.1),
        ("规划器", 13.1, 14.0),
        ("重写了。", 14.0, 15.0),
        # A distractor far away
        ("不相关的内容。", 25.0, 26.0),
    ]

    helpers_ns.write_srt(srt_path, [(1, 0.0, 3.0, "我们这季度把规划器重写了。")])
    write_transcript(
        transcript_path,
        [{"text": t, "start": s, "end": e, "type": "word"} for t, s, e in spoken],
    )

    results = rec.recommend(
        script_srt=srt_path,
        transcript=transcript_path,
        source=Path("fake.mp4"),
        output=tmp_path / "plan.json",
    )
    best = results[0]
    assert best.cand is not None
    assert abs(best.cand.start - 12.0) < 1e-6
    assert abs(best.cand.end - 15.0) < 1e-6
    assert best.score > 0.7


# ---------------------------------------------------------------------------
# 3. Punctuation + case differences still match
# ---------------------------------------------------------------------------


def test_punct_and_case_invariant(rec, helpers_ns, tmp_path):
    """Case and punctuation differences must not lower the match score."""
    srt_path = tmp_path / "script.srt"
    transcript_path = tmp_path / "transcript.json"
    # Transcript: mixed case + phrase punct (commas keep words grouped); the
    # SENTENCE-end '!' only on the last word so all three stay in one candidate.
    spoken = [
        ("HELLO,", 1.0, 1.5),
        ("There,", 1.5, 2.0),
        ("FRIENDS!", 2.0, 3.0),
    ]

    # SRT: lowercase, no punct, matching duration
    helpers_ns.write_srt(srt_path, [(1, 0.0, 2.0, "hello there friends")])
    write_transcript(
        transcript_path,
        [{"text": t, "start": s, "end": e, "type": "word"} for t, s, e in spoken],
    )

    results = rec.recommend(
        script_srt=srt_path,
        transcript=transcript_path,
        source=Path("fake.mp4"),
        output=tmp_path / "plan.json",
    )
    best = results[0]
    assert best.cand is not None
    assert best.score > 0.85, f"normalization should erase case+punct, got {best.score}"


# ---------------------------------------------------------------------------
# 4. Silence gap splits candidates
# ---------------------------------------------------------------------------


def test_silence_gap_splits(rec, tmp_path):
    """A 1.0s silence between two phrases yields two candidates at a 0.5s
    gap threshold and a single merged one at 1.1s — no sentence-end
    punctuation is involved.
    """
    transcript_path = tmp_path / "transcript.json"
    spoken = [
        ("alpha", 1.0, 1.4),
        ("beta", 1.4, 2.0),
        # 1.0s silence
        ("gamma", 3.0, 3.4),
        ("delta", 3.4, 4.0),
    ]
    write_transcript(
        transcript_path,
        [{"text": t, "start": s, "end": e, "type": "word"} for t, s, e in spoken],
    )
    words = rec.load_transcript_words(transcript_path)

    split = rec.build_candidates(words, gap_threshold=0.5)
    assert len(split) == 2
    first = split[0]
    second = split[1]
    assert abs(first.start - 1.0) < 1e-6 and abs(first.end - 2.0) < 1e-6
    assert abs(second.start - 3.0) < 1e-6 and abs(second.end - 4.0) < 1e-6

    # Tightening the gap shouldn't merge them (still well over threshold)
    # Loosening past 1.0s should:
    joined = rec.build_candidates(words, gap_threshold=1.1)
    assert len(joined) == 1
# ---------------------------------------------------------------------------
# 5. Low-score match emits warning
# ---------------------------------------------------------------------------


def test_low_score_warning(rec, helpers_ns, tmp_path):
    """A poor match is still assigned but carries a 'low score' warning."""
    srt_path = tmp_path / "script.srt"
    transcript_path = tmp_path / "transcript.json"
    spoken = [("apple", 1.0, 1.5), ("banana", 1.5, 2.0)]

    # Cue text shares almost no tokens with any candidate
    helpers_ns.write_srt(srt_path, [(1, 0.0, 2.0, "quantum entanglement decoherence")])
    write_transcript(
        transcript_path,
        [{"text": t, "start": s, "end": e, "type": "word"} for t, s, e in spoken],
    )

    results = rec.recommend(
        script_srt=srt_path,
        transcript=transcript_path,
        source=Path("fake.mp4"),
        output=tmp_path / "plan.json",
        min_score=0.5,  # set high to force the warning
    )
    best = results[0]
    assert best.cand is not None  # still got SOME candidate
    assert any("low score" in note for note in best.warnings)


# ---------------------------------------------------------------------------
# 6. SRT id ordering preserved in output
# ---------------------------------------------------------------------------


def test_ids_preserved(rec, helpers_ns, tmp_path):
    """The written plan lists the cue ids in SRT order: 1, 2, 3."""
    srt_path = tmp_path / "script.srt"
    transcript_path = tmp_path / "transcript.json"
    plan_path = tmp_path / "plan.json"
    spoken = [
        ("alpha.", 1.0, 1.5),
        ("beta.", 5.0, 5.5),
        ("gamma.", 10.0, 10.5),
    ]

    helpers_ns.write_srt(srt_path, [
        (1, 0.0, 1.0, "alpha"),
        (2, 1.0, 2.0, "beta"),
        (3, 2.0, 3.0, "gamma"),
    ])
    write_transcript(
        transcript_path,
        [{"text": t, "start": s, "end": e, "type": "word"} for t, s, e in spoken],
    )

    rec.recommend(
        script_srt=srt_path,
        transcript=transcript_path,
        source=Path("fake.mp4"),
        output=plan_path,
    )
    rows = json.loads(plan_path.read_text(encoding="utf-8"))
    assert [row["id"] for row in rows] == [1, 2, 3]
# ---------------------------------------------------------------------------
# 7. Output is parseable by srt_driven_edit.parse_plan
# ---------------------------------------------------------------------------


def test_output_is_parseable_by_sde(rec, sde, helpers_ns, tmp_path):
    """The default plan output round-trips through `sde.parse_plan`."""
    srt_path = tmp_path / "script.srt"
    transcript_path = tmp_path / "transcript.json"
    plan_path = tmp_path / "plan.json"
    spoken = [("alpha.", 1.0, 1.5), ("beta.", 5.0, 5.5)]

    helpers_ns.write_srt(srt_path, [
        (1, 0.0, 1.0, "alpha"),
        (2, 1.0, 2.0, "beta"),
    ])
    write_transcript(
        transcript_path,
        [{"text": t, "start": s, "end": e, "type": "word"} for t, s, e in spoken],
    )
    rec.recommend(
        script_srt=srt_path,
        transcript=transcript_path,
        source=Path("fake.mp4"),
        output=plan_path,
    )

    sources, voices, entries = sde.parse_plan(plan_path)
    assert sources == {} and voices == {}  # Form A — no maps
    assert [entry.id for entry in entries] == [1, 2]
    assert all(entry.source_name == "_default" for entry in entries)
    # Parsed ranges must equal the transcript word timings fed in above.
    first, second = entries[0], entries[1]
    assert first.source_start == 1.0 and first.source_end == 1.5
    assert second.source_start == 5.0 and second.source_end == 5.5
# ---------------------------------------------------------------------------
# 8. Form B output carries the source name
# ---------------------------------------------------------------------------


def test_form_b_output(rec, sde, helpers_ns, tmp_path):
    """`output_format="form-b"` records the source name in the plan JSON."""
    srt_path = tmp_path / "script.srt"
    transcript_path = tmp_path / "transcript.json"
    plan_path = tmp_path / "plan.json"

    helpers_ns.write_srt(srt_path, [(1, 0.0, 1.0, "alpha")])
    write_transcript(transcript_path, [
        {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"},
    ])
    rec.recommend(
        script_srt=srt_path,
        transcript=transcript_path,
        source=tmp_path / "src.mp4",
        source_name="TAKE_A",
        output_format="form-b",
        output=plan_path,
    )

    raw = json.loads(plan_path.read_text(encoding="utf-8"))
    assert "TAKE_A" in raw["sources"]
    first_segment = raw["segments"][0]
    assert first_segment["source"] == "TAKE_A"

    # And it's parseable by sde.parse_plan too
    sources, _, entries = sde.parse_plan(plan_path)
    assert "TAKE_A" in sources
    assert entries[0].source_name == "TAKE_A"


# ---------------------------------------------------------------------------
# 9. No candidates → hard fail (per spec)
# ---------------------------------------------------------------------------


def test_no_candidates_aborts(rec, helpers_ns, tmp_path):
    """A transcript with no word entries makes `recommend` raise SystemExit."""
    srt_path = tmp_path / "script.srt"
    transcript_path = tmp_path / "transcript.json"

    helpers_ns.write_srt(srt_path, [(1, 0.0, 1.0, "alpha")])
    # Transcript has only audio_event (no words)
    only_event = [
        {"text": "(laughter)", "start": 1.0, "end": 2.0, "type": "audio_event"},
    ]
    write_transcript(transcript_path, only_event)

    with pytest.raises(SystemExit):
        rec.recommend(
            script_srt=srt_path,
            transcript=transcript_path,
            source=Path("fake.mp4"),
            output=tmp_path / "plan.json",
        )
# ---------------------------------------------------------------------------
# 10. Review markdown shows score + warnings
# ---------------------------------------------------------------------------


def test_review_markdown_content(rec, helpers_ns, tmp_path):
    """`<plan stem>_review.md` is written next to the plan and names the cue,
    its text, the score and the source range."""
    srt = tmp_path / "script.srt"
    transcript = tmp_path / "transcript.json"
    helpers_ns.write_srt(srt, [(1, 0.0, 2.0, "Hello world")])
    write_transcript(transcript, [
        {"text": "Hello", "start": 1.0, "end": 1.5, "type": "word"},
        {"text": "world.", "start": 1.5, "end": 2.0, "type": "word"},
    ])
    out = tmp_path / "plan.json"
    rec.recommend(
        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
        output=out,
    )
    review = (out.with_name(out.stem + "_review.md")).read_text(encoding="utf-8")
    assert "cue id=1" in review
    assert "Hello world" in review
    assert "**score**" in review
    assert "**source range**" in review


# ---------------------------------------------------------------------------
# 11. Source-time ordering: backward-jump warning, monotonic mode, max gap
# ---------------------------------------------------------------------------


def test_backward_source_jump_warns_by_default(rec, helpers_ns, tmp_path):
    """When a later cue matches an earlier source position, a warning fires.

    Two cues both want a line that appears twice in the source. Greedy
    matching with no constraint picks the EARLIEST instance for the
    earlier-ID cue (because Jaccard score breaks ties by first hit), then
    the SECOND instance for the later cue — so source time advances and
    no warning. We construct the inverse: make the earlier-ID cue prefer
    the LATER instance (longer duration → better duration_similarity),
    leaving only the earlier instance for the later cue, producing a
    backward jump that must be flagged.
    """
    srt = tmp_path / "script.srt"
    transcript = tmp_path / "transcript.json"
    # Cue 1 prefers a 2.0s match; cue 2 prefers a 1.0s match.
    helpers_ns.write_srt(srt, [
        (1, 0.0, 2.0, "alpha alpha alpha"),
        (2, 2.0, 3.0, "alpha alpha alpha"),
    ])
    write_transcript(transcript, [
        # Early instance: 1.0s duration → cue 2 will prefer it
        {"text": "alpha", "start": 5.0, "end": 5.4, "type": "word"},
        {"text": "alpha", "start": 5.4, "end": 5.7, "type": "word"},
        {"text": "alpha.", "start": 5.7, "end": 6.0, "type": "word"},
        # Late instance: 2.0s duration → cue 1 will prefer it
        {"text": "alpha", "start": 20.0, "end": 20.7, "type": "word"},
        {"text": "alpha", "start": 20.7, "end": 21.4, "type": "word"},
        {"text": "alpha.", "start": 21.4, "end": 22.0, "type": "word"},
    ])

    assignments = rec.recommend(
        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
        output=tmp_path / "plan.json",
    )
    # Cue 1 picked the 2s late instance, cue 2 picked the 1s early one → backward
    assert assignments[0].cand.start >= 20.0
    assert assignments[1].cand.start <= 6.0
    backward_warnings = [w for w in assignments[1].warnings
                         if "backward" in w]
    assert backward_warnings, \
        f"expected a backward-time warning on cue 2, got: {assignments[1].warnings}"


def test_monotonic_source_prevents_backward_jump(rec, helpers_ns, tmp_path):
    """With --monotonic-source the same setup must NOT pick the early
    instance for cue 2. The constraint forces cue 2's candidate to start
    at or after cue 1's end."""
    srt = tmp_path / "script.srt"
    transcript = tmp_path / "transcript.json"
    helpers_ns.write_srt(srt, [
        (1, 0.0, 2.0, "alpha alpha alpha"),
        (2, 2.0, 3.0, "alpha alpha alpha"),
    ])
    # Same as the previous test PLUS a third late instance so cue 2 has a
    # forward option under the constraint.
    write_transcript(transcript, [
        {"text": "alpha", "start": 5.0, "end": 5.4, "type": "word"},
        {"text": "alpha", "start": 5.4, "end": 5.7, "type": "word"},
        {"text": "alpha.", "start": 5.7, "end": 6.0, "type": "word"},
        {"text": "alpha", "start": 20.0, "end": 20.7, "type": "word"},
        {"text": "alpha", "start": 20.7, "end": 21.4, "type": "word"},
        {"text": "alpha.", "start": 21.4, "end": 22.0, "type": "word"},
        {"text": "alpha", "start": 30.0, "end": 30.4, "type": "word"},
        {"text": "alpha", "start": 30.4, "end": 30.7, "type": "word"},
        {"text": "alpha.", "start": 30.7, "end": 31.0, "type": "word"},
    ])

    assignments = rec.recommend(
        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
        output=tmp_path / "plan.json",
        monotonic_source=True,
    )
    assert assignments[0].cand.start >= 20.0
    assert assignments[1].cand.start >= assignments[0].cand.end - 1e-6, \
        "cue 2's candidate must start at or after cue 1's end under monotonic"
    # No backward warning under monotonic mode
    assert not any("backward" in w for w in assignments[1].warnings)


def test_max_source_gap_warning(rec, helpers_ns, tmp_path):
    """--max-source-gap fires a warning when the gap exceeds the threshold."""
    srt = tmp_path / "script.srt"
    transcript = tmp_path / "transcript.json"
    helpers_ns.write_srt(srt, [
        (1, 0.0, 1.0, "alpha"),
        (2, 1.0, 2.0, "beta"),
    ])
    write_transcript(transcript, [
        {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"},
        # Big gap to next: beta is at 60+ seconds away
        {"text": "beta.", "start": 65.0, "end": 65.5, "type": "word"},
    ])
    assignments = rec.recommend(
        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
        output=tmp_path / "plan.json",
        max_source_gap_warn=10.0,  # gap is ~63.5s, well over 10s
    )
    jump_warnings = [w for w in assignments[1].warnings
                     if "source-time jump" in w]
    assert jump_warnings, \
        f"expected a big-gap warning, got: {assignments[1].warnings}"


def test_monotonic_with_no_forward_candidate_fails(rec, helpers_ns, tmp_path):
    """If no candidate can satisfy the monotonic constraint, the cue gets
    the 'no candidate available at or after ...' warning and write_plan
    hard-fails (per the no-candidate contract)."""
    srt = tmp_path / "script.srt"
    transcript = tmp_path / "transcript.json"
    helpers_ns.write_srt(srt, [
        (1, 0.0, 1.0, "alpha"),
        (2, 1.0, 2.0, "beta"),
    ])
    write_transcript(transcript, [
        # alpha matches at 20s, taking cue 1
        {"text": "alpha.", "start": 20.0, "end": 21.0, "type": "word"},
        # beta only available BEFORE alpha — monotonic can't reach it
        {"text": "beta.", "start": 5.0, "end": 6.0, "type": "word"},
    ])
    with pytest.raises(SystemExit):
        rec.recommend(
            script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
            output=tmp_path / "plan.json",
            monotonic_source=True,
        )


# ---------------------------------------------------------------------------
# 12. End-to-end: recommend → run_job → final mp4 exists
# ---------------------------------------------------------------------------


def test_e2e_recommend_then_render(
    rec, sde, helpers_ns, synth_av, tmp_path
):
    """Full chain: fabricated transcript → recommend → run_job → final.mp4."""
    srt = tmp_path / "script.srt"
    transcript = tmp_path / "transcript.json"
    plan = tmp_path / "plan.json"
    out_mp4 = tmp_path / "final.mp4"

    # 3 cues totaling 6s of output
    helpers_ns.write_srt(srt, [
        (1, 0.0, 2.0, "alpha beta"),
        (2, 2.0, 4.0, "gamma delta"),
        (3, 4.0, 6.0, "epsilon zeta"),
    ])
    # Transcript: words that match each cue at distinct, valid times in synth_av (30s)
    # Each candidate is exactly 2s — matches cue duration exactly so no on-short needed.
    write_transcript(transcript, [
        {"text": "alpha", "start": 1.0, "end": 1.8, "type": "word"},
        {"text": "beta.", "start": 1.8, "end": 3.0, "type": "word"},
        # silence gap
        {"text": "gamma", "start": 8.0, "end": 8.8, "type": "word"},
        {"text": "delta.", "start": 8.8, "end": 10.0, "type": "word"},
        # silence gap
        {"text": "epsilon", "start": 18.0, "end": 18.8, "type": "word"},
        {"text": "zeta.", "start": 18.8, "end": 20.0, "type": "word"},
    ])

    assignments = rec.recommend(
        script_srt=srt, transcript=transcript, source=synth_av,
        output=plan,
    )
    assert len(assignments) == 3
    assert all(a.cand is not None for a in assignments)

    # Render via the existing pipeline
    ffmpeg_version = sde.preflight()["ffmpeg"]
    job = sde.Job(
        source=synth_av,
        srt=srt, plan=plan,
        voice=None, bg_volume=0.0,
        tolerance=0.5, trim_direction="tail", on_short="error",
        style="auto", fontsdir=None,
        output=out_mp4,
        name="e2e",
        no_cache=False, keep_intermediates=False, no_overwrite=False,
    )
    qc = sde.run_job(job, ffmpeg_version)
    assert qc["ok"] is True
    assert out_mp4.exists()
    assert abs(qc["duration"]["drift_ms"]) <= 200


# ---- tests/test_run_episodes.py ----


@pytest.fixture
def runner():
    """Import the batch runner module under test."""
    import run_episodes
    return run_episodes


@pytest.fixture
def ffmpeg_version(helpers_ns):
    """The "ffmpeg" entry of `srt_driven_edit.preflight()`."""
    return helpers_ns.sde.preflight()["ffmpeg"]


def _make_ep(ep_dir: Path, source: Path, helpers_ns, *,
             cues=None, plan=None, voice: Path | None = None) -> None:
    """Create one episode directory: source.mp4, script.srt, edit_plan.json.

    Args:
        ep_dir: Directory to create (parents included).
        source: Media file copied in as `source.mp4`.
        helpers_ns: Namespace providing `write_srt` / `write_plan_form_a`.
        cues: SRT cues; `None` selects a two-cue default.
        plan: Form A segments; `None` selects a two-segment default.
        voice: Optional file copied in as `voice.wav`.
    """
    # Compare against None rather than truthiness so an explicitly empty
    # list is written as given instead of being swapped for the defaults.
    if cues is None:
        cues = [
            (1, 0.0, 1.5, "alpha"),
            (2, 1.5, 3.0, "beta"),
        ]
    if plan is None:
        plan = [
            (1, 1.0, 2.5),
            (2, 5.0, 6.5),
        ]
    ep_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source, ep_dir / "source.mp4")
    helpers_ns.write_srt(ep_dir / "script.srt", cues)
    helpers_ns.write_plan_form_a(ep_dir / "edit_plan.json", plan)
    if voice is not None:
        shutil.copy2(voice, ep_dir / "voice.wav")


# ---------------------------------------------------------------------------
# 1. Discovery: pick up complete dirs, skip incomplete ones
# ---------------------------------------------------------------------------


def test_discover_skips_incomplete_dirs(runner, helpers_ns, synth_av, tmp_path):
    """Only directories holding source, SRT and plan are discovered."""
    batch = tmp_path / "batch"
    _make_ep(batch / "ep01", synth_av, helpers_ns)
    _make_ep(batch / "ep02", synth_av, helpers_ns)
    # incomplete: missing edit_plan.json
    bad = batch / "ep03"
    bad.mkdir(parents=True)
    shutil.copy2(synth_av, bad / "source.mp4")
    helpers_ns.write_srt(bad / "script.srt", [(1, 0.0, 1.5, "x")])
    # not a dir at all
    (batch / "stray.txt").write_text("ignore me", encoding="utf-8")

    eps = runner.discover_episodes(batch)
    names = [e.name for e in eps]
    assert names == ["ep01", "ep02"]


def test_discover_sees_voice_wav_if_present(
    runner, helpers_ns, synth_av, synth_voice, tmp_path
):
    """An episode's `voice` is set only when voice.wav exists in its dir."""
    batch = tmp_path / "batch"
    _make_ep(batch / "ep01", synth_av, helpers_ns)
    _make_ep(batch / "ep02", synth_av, helpers_ns, voice=synth_voice)

    eps = runner.discover_episodes(batch)
    by_name = {e.name: e for e in eps}
    assert by_name["ep01"].voice is None
    assert by_name["ep02"].voice is not None and by_name["ep02"].voice.is_file()


def test_discover_hard_fails_on_empty_root(runner, tmp_path):
    """An empty batch root aborts with a 'no usable' message."""
    batch = tmp_path / "empty"
    batch.mkdir()
    with pytest.raises(SystemExit) as exc:
        runner.discover_episodes(batch)
    assert "no usable" in str(exc.value)
End-to-end: 3 eps run sequentially, each produces final.mp4 +# --------------------------------------------------------------------------- + + +def test_run_episodes_e2e(runner, helpers_ns, ffmpeg_version, synth_av, tmp_path): + batch = tmp_path / "batch" + for name in ("ep01", "ep02", "ep03"): + _make_ep(batch / name, synth_av, helpers_ns) + + summary = runner.run_episodes(batch, ffmpeg_version=ffmpeg_version) + + assert summary["episodes_total"] == 3 + assert summary["ok"] == 3 + for name in ("ep01", "ep02", "ep03"): + final = batch / name / "final.mp4" + assert final.exists(), f"{name}/final.mp4 missing" + + # Summary artifact + summary_file = batch / "run_episodes_summary.json" + assert summary_file.exists() + + +# --------------------------------------------------------------------------- +# 3. continue-on-error skips a broken ep, finishes the rest +# --------------------------------------------------------------------------- + + +def test_run_episodes_continue_on_error( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + # ep02: range exceeds the synth source (30s) — pre-extract range check fires + _make_ep(batch / "ep02", synth_av, helpers_ns, + plan=[(1, 1.0, 2.5), (2, 60.0, 61.5)]) + _make_ep(batch / "ep03", synth_av, helpers_ns) + + summary = runner.run_episodes( + batch, ffmpeg_version=ffmpeg_version, + continue_on_error=True, + ) + assert summary["episodes_total"] == 3 + assert summary["ok"] == 2 + # ep01 + ep03 produced output, ep02 did not + assert (batch / "ep01" / "final.mp4").exists() + assert not (batch / "ep02" / "final.mp4").exists() + assert (batch / "ep03" / "final.mp4").exists() + + +def test_run_episodes_aborts_without_continue_on_error( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + _make_ep(batch / "ep02", synth_av, helpers_ns, + plan=[(1, 1.0, 2.5), (2, 60.0, 
61.5)]) # bad + _make_ep(batch / "ep03", synth_av, helpers_ns) + + with pytest.raises(SystemExit): + runner.run_episodes(batch, ffmpeg_version=ffmpeg_version) + # ep03 was never reached + assert not (batch / "ep03" / "final.mp4").exists() + + +# --------------------------------------------------------------------------- +# 4. Per-ep voice.wav becomes a global voice for that ep +# --------------------------------------------------------------------------- + + +def test_run_episodes_failure_record_includes_paths( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """When --continue-on-error skips an ep, the record must carry enough + context to triage without re-reading the terminal.""" + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + _make_ep(batch / "ep02", synth_av, helpers_ns, + plan=[(1, 1.0, 2.5), (2, 60.0, 61.5)]) # range overruns 30s synth + _make_ep(batch / "ep03", synth_av, helpers_ns) + + summary = runner.run_episodes( + batch, ffmpeg_version=ffmpeg_version, + continue_on_error=True, + ) + failed = [r for r in summary["results"] if not r.get("ok")] + assert len(failed) == 1 + rec = failed[0] + assert rec["job"] == "ep02" + assert rec["index"] == 1 + assert rec["srt"].endswith("script.srt") + assert rec["plan"].endswith("edit_plan.json") + assert rec["source"].endswith("source.mp4") + assert rec["output"].endswith("final.mp4") + assert rec["error"] + # Pre-extract range-bounds check → no ffmpeg → empty stderr + assert rec["stderr_tail"] == "" + + +def test_run_episodes_continues_past_corrupt_plan_json( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """A non-SystemExit (JSONDecodeError) inside run_job must NOT abort + --continue-on-error. 
Pre-fix the loop only caught SystemExit, so a + malformed edit_plan.json in one ep would crash the whole batch.""" + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + # ep02: valid SRT + source, but plan.json is garbage + ep02 = batch / "ep02" + ep02.mkdir() + import shutil + shutil.copy2(synth_av, ep02 / "source.mp4") + helpers_ns.write_srt(ep02 / "script.srt", [(1, 0.0, 1.5, "x")]) + (ep02 / "edit_plan.json").write_text("{ not json", encoding="utf-8") + # ep03 should still run + _make_ep(batch / "ep03", synth_av, helpers_ns) + + summary = runner.run_episodes( + batch, ffmpeg_version=ffmpeg_version, + continue_on_error=True, + ) + assert summary["episodes_total"] == 3 + assert summary["ok"] == 2 + + failed = [r for r in summary["results"] if not r.get("ok")] + assert len(failed) == 1 and failed[0]["job"] == "ep02" + assert "JSON" in failed[0]["error"] or "json" in failed[0]["error"] + # ep03 (the post-bad one) must have run + assert (batch / "ep03" / "final.mp4").exists() + + +def test_run_episodes_extract_mode( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """--mode extract across multiple eps: each ep produces clip_*.mp4 in its + own edit/extracted_clips_/ and NOT a final.mp4.""" + batch = tmp_path / "batch" + for name in ("ep01", "ep02"): + _make_ep(batch / name, synth_av, helpers_ns) + + summary = runner.run_episodes( + batch, ffmpeg_version=ffmpeg_version, mode="extract", + ) + + assert summary["episodes_total"] == 2 + assert summary["ok"] == 2 + for r in summary["results"]: + assert r["mode"] == "extract" + assert r["clip_count"] == 2 # CUES_2 has 2 cues + extracted_dir = Path(r["extracted_dir"]) + assert extracted_dir.is_dir() + assert (extracted_dir / "clip_001.mp4").exists() + assert (extracted_dir / "clip_002.mp4").exists() + # No final.mp4 in any ep dir + for name in ("ep01", "ep02"): + assert not (batch / name / "final.mp4").exists() + + +def test_run_episodes_per_ep_voice( + runner, helpers_ns, 
ffmpeg_version, synth_av, synth_voice, tmp_path +): + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + _make_ep(batch / "ep02", synth_av, helpers_ns, voice=synth_voice) + + summary = runner.run_episodes(batch, ffmpeg_version=ffmpeg_version) + + by_name = {r["job"]: r for r in summary["results"]} + assert by_name["ep01"]["audio"]["voice_used"] is False + assert by_name["ep02"]["audio"]["voice_used"] is True + assert by_name["ep02"]["audio"]["mode"] == "voice_replace" diff --git a/tests/test_srt_driven_batch.py b/tests/test_srt_driven_batch.py new file mode 100644 index 0000000..37733d5 --- /dev/null +++ b/tests/test_srt_driven_batch.py @@ -0,0 +1,325 @@ +"""Batch-manifest tests for srt_driven_edit. + +Exercises load_manifest + job_from_dict + run_job in the loop pattern that +the CLI uses, without depending on argv parsing. +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +import pytest + + +# --------------------------------------------------------------------------- +# Common cue/plan helpers used across batch jobs +# --------------------------------------------------------------------------- + +CUES_2 = [ + (1, 0.0, 2.0, "alpha"), + (2, 2.0, 4.0, "beta"), +] + +PLAN_2 = [ + (1, 1.0, 3.0), + (2, 5.0, 7.0), +] + + +def default_args_namespace() -> argparse.Namespace: + """Build the defaults Namespace job_from_dict expects.""" + return argparse.Namespace( + bg_volume=0.0, + tolerance=0.5, + trim_direction="tail", + on_short="error", + style="auto", + no_cache=False, + keep_intermediates=False, + no_overwrite=False, + ) + + +def run_batch(helpers_ns, manifest_path, ffmpeg_version, *, + continue_on_error: bool = False) -> list[dict]: + """Mirror the CLI's batch loop so we can unit-test it.""" + sde = helpers_ns.sde + defaults = default_args_namespace() + rows = sde.load_manifest(manifest_path) + results: list[dict] = [] + for i, row in enumerate(rows): + try: + job = 
sde.job_from_dict(row, defaults, manifest_path.parent, i) + except (SystemExit, Exception) as e: + if continue_on_error: + results.append(sde.make_failure_record( + index=i, name=row.get("name", f"row{i}"), + error=e, job=None, manifest_row=row, + )) + continue + raise + try: + results.append(sde.run_job(job, ffmpeg_version)) + except (SystemExit, Exception) as e: + if continue_on_error: + results.append(sde.make_failure_record( + index=i, name=job.name, error=e, job=job, + )) + continue + raise + return results + + +@pytest.fixture +def ffmpeg_version(helpers_ns) -> str: + return helpers_ns.sde.preflight()["ffmpeg"] + + +# --------------------------------------------------------------------------- +# 1. Two jobs same name, no output specified → auto-isolated outputs +# --------------------------------------------------------------------------- + + +def test_batch_auto_isolation(helpers_ns, ffmpeg_version, synth_av, tmp_path): + # Two SRTs / plans with distinct content but identical job name + for i in range(2): + srt = tmp_path / f"script_{i}.srt" + plan = tmp_path / f"plan_{i}.json" + helpers_ns.write_srt(srt, CUES_2) + helpers_ns.write_plan_form_a(plan, PLAN_2) + + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "promo", # same name on purpose + "source": str(synth_av), + "srt": "script_0.srt", + "plan": "plan_0.json"}, + {"name": "promo", # collision + "source": str(synth_av), + "srt": "script_1.srt", + "plan": "plan_1.json"}, + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest_path, ffmpeg_version) + assert len(results) == 2 + assert all(r["ok"] for r in results) + + out_paths = [Path(r["output_path"]) for r in results] + # auto-isolated → distinct + assert out_paths[0] != out_paths[1] + # Names should contain the index suffix + assert "_00" in out_paths[0].name + assert "_01" in out_paths[1].name + for p in out_paths: + assert p.exists() + + +# 
--------------------------------------------------------------------------- +# 2. continue-on-error skips a malformed row, finishes the rest +# --------------------------------------------------------------------------- + + +def test_batch_continue_on_error(helpers_ns, ffmpeg_version, synth_av, tmp_path): + # Three jobs: 0 ok, 1 has a missing 'plan' field, 2 ok + for i in (0, 2): + helpers_ns.write_srt(tmp_path / f"s{i}.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / f"p{i}.json", PLAN_2) + + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "ok0", "source": str(synth_av), + "srt": "s0.srt", "plan": "p0.json"}, + {"name": "broken", "source": str(synth_av), + "srt": "s_missing.srt"}, # no plan, srt also missing + {"name": "ok2", "source": str(synth_av), + "srt": "s2.srt", "plan": "p2.json"}, + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest_path, ffmpeg_version, + continue_on_error=True) + assert len(results) == 3 + assert results[0]["ok"] is True + assert results[1]["ok"] is False and "error" in results[1] + assert results[2]["ok"] is True + + +def test_batch_aborts_without_continue_on_error( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + helpers_ns.write_srt(tmp_path / "s0.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p0.json", PLAN_2) + + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "ok0", "source": str(synth_av), + "srt": "s0.srt", "plan": "p0.json"}, + {"name": "broken", "source": str(synth_av), + "srt": "s_missing.srt"}, # no plan + ]), encoding="utf-8") + + with pytest.raises(SystemExit): + run_batch(helpers_ns, manifest_path, ffmpeg_version, + continue_on_error=False) + + +# --------------------------------------------------------------------------- +# 3. 
CSV manifest is supported +# --------------------------------------------------------------------------- + + +def test_batch_csv_manifest(helpers_ns, ffmpeg_version, synth_av, tmp_path): + helpers_ns.write_srt(tmp_path / "s.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p.json", PLAN_2) + + manifest = tmp_path / "jobs.csv" + manifest.write_text( + "name,source,srt,plan,bg_volume\n" + f"promo,{synth_av},s.srt,p.json,0.0\n", + encoding="utf-8", + ) + results = run_batch(helpers_ns, manifest, ffmpeg_version) + assert len(results) == 1 and results[0]["ok"] is True + + +# --------------------------------------------------------------------------- +# 4. Different bg_volume per job is honored (cache must NOT collide) +# --------------------------------------------------------------------------- + + +def test_run_ff_raises_pipeline_error_with_stderr(helpers_ns, tmp_path): + """run_ff must raise PipelineError carrying a non-empty stderr tail.""" + sde = helpers_ns.sde + out = tmp_path / "out.mp4" + bogus = tmp_path / "definitely_missing.mp4" + with pytest.raises(sde.PipelineError) as exc: + sde.run_ff( + ["ffmpeg", "-y", "-hide_banner", "-i", str(bogus), str(out)], + "intentional failure", + ) + # Subclass of SystemExit → existing handlers keep working + assert isinstance(exc.value, SystemExit) + assert exc.value.stderr_tail, "stderr_tail should be populated on ffmpeg failure" + # The stderr from ffmpeg complaining about a missing input should mention it + assert "definitely_missing.mp4" in exc.value.stderr_tail \ + or "No such file" in exc.value.stderr_tail + + +def test_batch_failure_record_includes_paths( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """A failed batch row must carry index/srt/plan/source/output for triage.""" + helpers_ns.write_srt(tmp_path / "s_ok.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p_ok.json", PLAN_2) + helpers_ns.write_srt(tmp_path / "s_bad.srt", CUES_2) + # out-of-bounds range (synth_av is 30s; 60s exceeds it) 
— fails in pre-flight, + # no ffmpeg invocation → stderr_tail should stay empty. + helpers_ns.write_plan_form_a(tmp_path / "p_bad.json", + [(1, 1.0, 3.0), (2, 60.0, 62.0)]) + + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "ok", "source": str(synth_av), + "srt": "s_ok.srt", "plan": "p_ok.json"}, + {"name": "bad", "source": str(synth_av), + "srt": "s_bad.srt", "plan": "p_bad.json"}, + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest_path, ffmpeg_version, + continue_on_error=True) + assert len(results) == 2 and results[0]["ok"] is True + failed = results[1] + assert failed["ok"] is False + assert failed["job"] == "bad" + assert failed["index"] == 1 + assert failed["srt"] and failed["srt"].endswith("s_bad.srt") + assert failed["plan"] and failed["plan"].endswith("p_bad.json") + assert failed["source"] == str(synth_av) + assert failed["output"] and failed["output"].endswith(".mp4") + assert failed["error"] + # Range-bounds check fires before any ffmpeg → no stderr + assert failed["stderr_tail"] == "" + + +def test_batch_malformed_row_failure_record(helpers_ns, ffmpeg_version, tmp_path): + """A row that fails inside job_from_dict still gets a usable record. + + No Job was ever constructed, so paths come from the raw manifest row. 
+ """ + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "broken", + "source": "raw/take.mp4", + "srt": "scripts/missing.srt"}, # no `plan` field + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest_path, ffmpeg_version, + continue_on_error=True) + assert len(results) == 1 + failed = results[0] + assert failed["ok"] is False + assert failed["job"] == "broken" + assert failed["index"] == 0 + # Source / SRT come from the row dict because Job construction never completed + assert failed["source"] == "raw/take.mp4" + assert failed["srt"] == "scripts/missing.srt" + assert failed["plan"] is None + assert failed["stderr_tail"] == "" + + +def test_batch_continues_past_corrupt_plan_json( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """A row whose plan.json is malformed must NOT abort the batch under + --continue-on-error. JSONDecodeError used to escape the loop because + we only caught SystemExit; the failure record now captures it. 
+ """ + # Good row + helpers_ns.write_srt(tmp_path / "s_ok.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p_ok.json", PLAN_2) + # Bad plan: not valid JSON + helpers_ns.write_srt(tmp_path / "s_bad.srt", CUES_2) + (tmp_path / "p_bad.json").write_text("{ this is not json", encoding="utf-8") + # Another good row after the bad one — must still run + helpers_ns.write_srt(tmp_path / "s_ok2.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p_ok2.json", PLAN_2) + + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "ok0", "source": str(synth_av), + "srt": "s_ok.srt", "plan": "p_ok.json"}, + {"name": "broken", "source": str(synth_av), + "srt": "s_bad.srt", "plan": "p_bad.json"}, + {"name": "ok2", "source": str(synth_av), + "srt": "s_ok2.srt", "plan": "p_ok2.json"}, + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest_path, ffmpeg_version, + continue_on_error=True) + assert len(results) == 3 + assert results[0]["ok"] is True + assert results[1]["ok"] is False + assert "JSON" in results[1]["error"] or "json" in results[1]["error"] + assert results[1]["plan"] and results[1]["plan"].endswith("p_bad.json") + assert results[2]["ok"] is True + + +def test_batch_per_job_bg_volume(helpers_ns, ffmpeg_version, synth_av, tmp_path): + helpers_ns.write_srt(tmp_path / "s.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p.json", PLAN_2) + + manifest = tmp_path / "jobs.json" + manifest.write_text(json.dumps([ + {"name": "silent", "source": str(synth_av), + "srt": "s.srt", "plan": "p.json", "bg_volume": 0.0}, + {"name": "bg10", "source": str(synth_av), + "srt": "s.srt", "plan": "p.json", "bg_volume": 0.1}, + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest, ffmpeg_version) + assert len(results) == 2 and all(r["ok"] for r in results) + assert results[0]["audio"]["mode"] == "silent" + assert results[1]["audio"]["mode"] == "original_only" + # bg10 should NOT have hit cache from silent (different 
effective_bg → different key) + assert all(s["cached"] is False for s in results[1]["segments"]) diff --git a/tests/test_srt_driven_e2e.py b/tests/test_srt_driven_e2e.py new file mode 100644 index 0000000..ea5f05e --- /dev/null +++ b/tests/test_srt_driven_e2e.py @@ -0,0 +1,441 @@ +"""End-to-end tests for srt_driven_edit. + +Each test crafts an SRT + plan file inside tmp_path, runs run_job against +the session-scoped synthetic source video, and verifies output existence, +duration accuracy (within 200ms), and QC report contents. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + + +# --------------------------------------------------------------------------- +# Test helpers +# --------------------------------------------------------------------------- + +DEFAULT_CUES = [ + (1, 0.0, 2.0, "first cue"), + (2, 2.0, 4.5, "second cue"), + (3, 6.0, 8.5, "third cue with leading gap"), # 1.5s gap before this +] + +DEFAULT_PLAN = [ + (1, 1.0, 3.0), # 2.0s from source[1.0-3.0] + (2, 5.0, 7.5), # 2.5s + (3, 10.0, 12.5), # 2.5s +] + + +def make_job(helpers_ns, srt_path, plan_path, tmp_path, *, + source=None, voice=None, bg_volume=0.0, + style="auto", no_overwrite=False, output=None, + mode="full"): + sde = helpers_ns.sde + return sde.Job( + source=source, + srt=srt_path, + plan=plan_path, + voice=voice, + bg_volume=bg_volume, + tolerance=0.5, + trim_direction="tail", + on_short="error", + style=style, + fontsdir=None, + output=output or (tmp_path / "out.mp4"), + name=srt_path.stem, + no_cache=False, + keep_intermediates=False, + no_overwrite=no_overwrite, + mode=mode, + ) + + +@pytest.fixture +def ffmpeg_version(helpers_ns) -> str: + return helpers_ns.sde.preflight()["ffmpeg"] + + +# --------------------------------------------------------------------------- +# 1. 
Basic e2e: source.mp4 + 3 cues → final has expected duration +# --------------------------------------------------------------------------- + + +def test_basic_single_job(helpers_ns, ffmpeg_version, synth_av, tmp_path): + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av) + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + assert qc["ok"] is True + assert qc["duration"]["expected_s"] == 8.5 + assert abs(qc["duration"]["drift_ms"]) <= 200 + assert (tmp_path / "out.mp4").exists() + assert qc["audio"]["mode"] == "silent" # bg_volume=0, no voice + + +# --------------------------------------------------------------------------- +# 2. GB18030 SRT input — encoding fallback must let the pipeline complete +# --------------------------------------------------------------------------- + + +def test_gbk_srt_input(helpers_ns, ffmpeg_version, synth_av, tmp_path): + srt = tmp_path / "script_gbk.srt" + plan = tmp_path / "plan.json" + cjk_cues = [ + (1, 0.0, 2.0, "第一条"), + (2, 2.0, 4.5, "第二条"), + (3, 6.0, 8.5, "第三条 含 gap"), + ] + helpers_ns.write_srt(srt, cjk_cues, encoding="gb18030") + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av, style="auto") + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + assert qc["ok"] is True + assert "Microsoft YaHei UI" in qc["subtitles"]["force_style"], \ + "auto style should pick cjk-natural when SRT contains CJK" + + +# --------------------------------------------------------------------------- +# 3. 
CJK in output path — work_dir + ensure_safe_subs_path must save us +# --------------------------------------------------------------------------- + + +def test_cjk_in_output_path(helpers_ns, ffmpeg_version, synth_av, tmp_path): + cjk_dir = tmp_path / "中文 目录" + cjk_dir.mkdir() + srt = cjk_dir / "字幕.srt" + plan = cjk_dir / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + out = cjk_dir / "成片.mp4" + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, output=out) + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + assert qc["ok"] is True + assert out.exists() + + +# --------------------------------------------------------------------------- +# 4. Per-segment voice — audio.mode should reflect voice usage +# --------------------------------------------------------------------------- + + +def test_per_segment_voice(helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path): + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + + helpers_ns.write_plan_form_b( + plan, + sources={"A": str(synth_av)}, + voices={"v1": str(synth_voice)}, + segments=[ + {"id": 1, "source": "A", "source_start": "00:00:01,000", + "source_end": "00:00:03,000", "voice": "v1"}, + {"id": 2, "source": "A", "source_start": "00:00:05,000", + "source_end": "00:00:07,500"}, + {"id": 3, "source": "A", "source_start": "00:00:10,000", + "source_end": "00:00:12,500"}, + ], + ) + + job = make_job(helpers_ns, srt, plan, tmp_path) # source=None — Form B + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + assert qc["ok"] is True + assert qc["audio"]["voice_used"] is True + assert qc["audio"]["mode"] == "voice_replace" # bg_volume == 0 + + +# --------------------------------------------------------------------------- +# 5. 
Video-only source + bg_volume > 0 → auto-degrade, no crash +# --------------------------------------------------------------------------- + + +def test_video_only_source_with_bg_volume( + helpers_ns, ffmpeg_version, synth_v_only, tmp_path, capsys +): + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_v_only, bg_volume=0.5) + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + captured = capsys.readouterr() + assert "no audio track" in captured.out, \ + "expected a WARNING about source having no audio" + assert qc["ok"] is True + + +# --------------------------------------------------------------------------- +# 6. Source range out of bounds → SystemExit before extraction +# --------------------------------------------------------------------------- + + +def test_range_out_of_bounds_fails_fast( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + # source is 30s, but ask for 0:50 — way over + helpers_ns.write_plan_form_a(plan, [ + (1, 1.0, 3.0), + (2, 5.0, 7.5), + (3, 50.0, 52.5), # bad + ]) + + job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av) + with pytest.raises(SystemExit) as exc: + helpers_ns.sde.run_job(job, ffmpeg_version) + assert "exceeds source" in str(exc.value) + # And the failure happened pre-extract, so no out.mp4 + assert not (tmp_path / "out.mp4").exists() + + +# --------------------------------------------------------------------------- +# 7. 
Second run hits cache for every segment +# --------------------------------------------------------------------------- + + +def test_cache_hit_on_rerun(helpers_ns, ffmpeg_version, synth_av, tmp_path): + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av) + qc1 = helpers_ns.sde.run_job(job, ffmpeg_version) + qc2 = helpers_ns.sde.run_job(job, ffmpeg_version) + + assert all(s["cached"] is False for s in qc1["segments"]) + assert all(s["cached"] is True for s in qc2["segments"]) + # Cache hits should be measurably faster + assert qc2["elapsed_s"] <= qc1["elapsed_s"] + + +# --------------------------------------------------------------------------- +# 8. --no-overwrite refuses to clobber existing output +# --------------------------------------------------------------------------- + + +def test_no_overwrite_refuses(helpers_ns, ffmpeg_version, synth_av, tmp_path): + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job1 = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av) + helpers_ns.sde.run_job(job1, ffmpeg_version) + + job2 = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, no_overwrite=True) + with pytest.raises(SystemExit) as exc: + helpers_ns.sde.run_job(job2, ffmpeg_version) + assert "no-overwrite" in str(exc.value) + + +# --------------------------------------------------------------------------- +# 9. SRT gap → output duration includes the gap as black+silent +# --------------------------------------------------------------------------- + + +def test_global_voice_spans_timeline( + helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path +): + """Global --voice must span the WHOLE output timeline, not restart per segment. 
+ + Regression: earlier implementation expanded --voice into a synthetic + per-segment voice on every entry, which made each segment apad/atrim + voice.wav from t=0 — so a 5s voice would replay at every cut. The fix + moves global-voice mixing into the final compose step where voice is + apad'd / atrim'd to total_duration once. + """ + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) # total 8.5s + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, voice=synth_voice) + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + assert qc["ok"] is True + assert qc["audio"]["voice_used"] is True + assert qc["audio"]["mode"] == "voice_replace" + assert qc["audio"]["bg_volume"] == 0.0 + # Per-segment voice slot must be None — proves we are NOT smuggling the + # global voice in via the per-segment expansion hack. + assert all(s["voice"] is None for s in qc["segments"]) + + # Output duration matches SRT total (voice apad'd from 5s → 8.5s) + actual = helpers_ns.sde.probe_duration(tmp_path / "out.mp4") + assert abs(actual - 8.5) < 0.25, f"actual {actual}s vs expected 8.5s" + + +def test_global_voice_with_bg_volume_mix( + helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path +): + """With bg_volume>0 and global voice, base audio (source*bg) is mixed + under voice. The bg_volume is applied ONCE at extract; the final compose + must not re-scale it. 
+ """ + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, voice=synth_voice, bg_volume=0.1) + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + assert qc["ok"] is True + assert qc["audio"]["mode"] == "voice_mix" + assert qc["audio"]["bg_volume"] == 0.1 + + +def test_global_voice_cache_independence( + helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path +): + """Segment cache must NOT depend on the global voice file. Running once + without voice then again with voice should reuse all segment caches — + voice gets mixed in the final pass, segments are identical. + """ + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job_no_voice = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av) + qc1 = helpers_ns.sde.run_job(job_no_voice, ffmpeg_version) + + job_with_voice = make_job( + helpers_ns, srt, plan, tmp_path, + source=synth_av, voice=synth_voice, + output=tmp_path / "out_voiced.mp4", + ) + qc2 = helpers_ns.sde.run_job(job_with_voice, ffmpeg_version) + + assert all(s["cached"] is False for s in qc1["segments"]), \ + "first run should not have cache hits" + assert all(s["cached"] is True for s in qc2["segments"]), \ + "second run with global voice should hit segment cache — voice is " \ + "mixed in the final pass, not baked into segments" + + +def test_extract_mode_stops_after_clips( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """--mode extract must produce per-cue clips and NOT a concat'd final.mp4.""" + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, mode="extract") + qc = 
helpers_ns.sde.run_job(job, ffmpeg_version) + + # Extract-mode result shape differs from the QC report + assert qc["ok"] is True + assert qc["mode"] == "extract" + assert qc["clip_count"] == 3 + extracted_dir = Path(qc["extracted_dir"]) + assert extracted_dir.is_dir() + + # Clips renamed to clip_.mp4 (matches srt_video_editor convention) + for cid in (1, 2, 3): + clip = extracted_dir / f"clip_{cid:03d}.mp4" + assert clip.is_file(), f"missing extracted clip: {clip}" + # Each clip should match its cue duration within encoder rounding + actual = helpers_ns.sde.probe_duration(clip) + expected = next(c for c in DEFAULT_CUES if c[0] == cid) + expected_dur = expected[2] - expected[1] + assert abs(actual - expected_dur) < 0.25, \ + f"clip {cid}: actual {actual}s vs expected {expected_dur}s" + + # And NO final.mp4 was produced — extract mode stopped early + assert not (tmp_path / "out.mp4").exists() + + +def test_extract_mode_skips_gap_clips( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """In extract mode, the synthetic black+silence gap clips are not made — + only real source extractions land in extracted_clips_/.""" + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + # Cues with a 1.5s gap between id=2 and id=3 (final_end=4.5, final_start=6.0) + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, mode="extract") + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + extracted_dir = Path(qc["extracted_dir"]) + # Only 3 clips (one per cue) — no gap_*.mp4 sneaks in + files = sorted(p.name for p in extracted_dir.iterdir()) + assert files == ["clip_001.mp4", "clip_002.mp4", "clip_003.mp4"] + + +def test_extract_mode_cleans_stale_clips( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """A previous extract-mode run's stale clips must be removed before this + run writes its own. Otherwise leftover clip_999.mp4 would pollute the dir. 
+ """ + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, mode="extract") + qc1 = helpers_ns.sde.run_job(job, ffmpeg_version) + extracted_dir = Path(qc1["extracted_dir"]) + + # Plant a stale clip and an unrelated note file + (extracted_dir / "clip_998.mp4").write_bytes(b"stale") + (extracted_dir / "notes.txt").write_text("user notes", encoding="utf-8") + + qc2 = helpers_ns.sde.run_job(job, ffmpeg_version) + files = sorted(p.name for p in Path(qc2["extracted_dir"]).iterdir()) + assert "clip_998.mp4" not in files, "stale clip should have been removed" + assert "notes.txt" in files, "non-clip user files must be preserved" + + +def test_gap_inserted_in_output(helpers_ns, ffmpeg_version, synth_av, tmp_path): + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + # 2 cues with a 1.5s gap between them: total output = 2.0s clip, 1.5s gap, 2.5s clip = 6.0s + cues = [ + (1, 0.0, 2.0, "first"), + (2, 3.5, 6.0, "second after gap"), + ] + helpers_ns.write_srt(srt, cues) + helpers_ns.write_plan_form_a(plan, [(1, 1.0, 3.0), (2, 5.0, 7.5)]) + + job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av) + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + assert qc["ok"] is True + assert qc["duration"]["expected_s"] == 6.0 + assert abs(qc["duration"]["drift_ms"]) <= 200 + # Probe the actual output file's duration to double-check the QC report + actual = helpers_ns.sde.probe_duration(tmp_path / "out.mp4") + assert abs(actual - 6.0) < 0.25, f"actual {actual}s, expected 6.0s" diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py new file mode 100644 index 0000000..8ce55cd --- /dev/null +++ b/tests/test_transcribe.py @@ -0,0 +1,126 @@ +"""Unit tests for transcribe.py — only the pure conversion logic. + +API calls require a live DashScope key and external network; those are +intentionally out of scope here. 
Run an end-to-end smoke manually: + + python helpers/transcribe.py path/to/clip.mp4 --language zh +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT / "helpers")) + + +@pytest.fixture +def transcribe_mod(): + import transcribe as t + return t + + +def test_convert_basic_sentence(transcribe_mod): + """One sentence with two words gets flattened into Scribe-shaped words[].""" + sentences = [ + { + "begin_time": 0, + "end_time": 1500, + "text": "你好世界", + "words": [ + {"begin_time": 0, "end_time": 500, "text": "你好", "punctuation": ""}, + {"begin_time": 500, "end_time": 1500, "text": "世界", "punctuation": "。"}, + ], + } + ] + out = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint="zh") + assert out["language_code"] == "zh" + assert out["_source"].startswith("dashscope-") + assert len(out["words"]) == 2 + assert out["words"][0] == {"text": "你好", "start": 0.0, "end": 0.5, "type": "word"} + # Each word's "punctuation" field is appended to that same word's text + assert out["words"][1] == {"text": "世界。", "start": 0.5, "end": 1.5, "type": "word"} + + +def test_convert_drops_empty_text(transcribe_mod): + """Whitespace-only / empty word entries are skipped, not emitted as junk.""" + sentences = [ + {"words": [ + {"begin_time": 0, "end_time": 100, "text": ""}, + {"begin_time": 100, "end_time": 200, "text": " "}, + {"begin_time": 200, "end_time": 400, "text": "hello"}, + ]} + ] + out = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint=None) + assert len(out["words"]) == 1 + assert out["words"][0]["text"] == "hello" + # No language hint → "auto" + assert out["language_code"] == "auto" + + +def test_convert_multiple_sentences(transcribe_mod): + """Words from multiple sentences flatten into a single ordered list.""" + sentences = [ + {"words": [ + {"begin_time": 0, "end_time": 500, "text": "first"}, + ]}, + {"words": [ + 
{"begin_time": 1000, "end_time": 1500, "text": "second"}, + {"begin_time": 1500, "end_time": 2000, "text": "third"}, + ]}, + ] + out = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint="en") + assert [w["text"] for w in out["words"]] == ["first", "second", "third"] + assert out["words"][0]["start"] == 0.0 + assert out["words"][-1]["end"] == 2.0 + + +def test_convert_tolerates_missing_or_bad_timestamps(transcribe_mod): + """A word with a non-numeric timestamp is skipped rather than crashing + the whole conversion; valid words after it are still emitted.""" + sentences = [ + {"words": [ + {"begin_time": "bad", "end_time": 500, "text": "junk"}, + {"begin_time": 0, "end_time": 500, "text": "good"}, + ]} + ] + out = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint=None) + assert [w["text"] for w in out["words"]] == ["good"] + + +def test_convert_empty_input(transcribe_mod): + """Empty / None input returns a structurally valid envelope with no words.""" + out = transcribe_mod._convert_dashscope_to_scribe([], language_hint=None) + assert out["words"] == [] + assert "language_code" in out and "_source" in out + + out_none = transcribe_mod._convert_dashscope_to_scribe(None, language_hint=None) + assert out_none["words"] == [] + + +def test_output_shape_compatible_with_recommender(transcribe_mod, tmp_path): + """Conversion produces JSON that recommend_edit_plan.load_transcript_words + can consume directly — this is the cross-module contract we promise.""" + import json + import recommend_edit_plan as rec + + sentences = [ + {"words": [ + {"begin_time": 1000, "end_time": 1500, "text": "hello", "punctuation": ""}, + {"begin_time": 1500, "end_time": 2000, "text": "world", "punctuation": "."}, + ]} + ] + transcript = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint="en") + + out_file = tmp_path / "transcript.json" + out_file.write_text(json.dumps(transcript, ensure_ascii=False), encoding="utf-8") + + words = rec.load_transcript_words(out_file) + assert 
len(words) == 2 + assert words[0]["text"] == "hello" + assert words[1]["text"] == "world." + assert words[0]["start"] == 1.0 + assert words[1]["end"] == 2.0