From 8738423ccae651959903a3d228324e09b66ae85c Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Tue, 19 May 2026 21:00:07 +0800 Subject: [PATCH 01/18] feat(srt_driven_edit): add SRT-driven edit pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Independent helper that assembles a final cut by aligning source ranges to an SRT timeline, bypassing the existing transcript-based EDL flow. Use when you have a finished script (script.srt = final captions timeline) and a list of source ranges keyed by SRT id. Pipeline: parse SRT + plan -> strict validate -> align -> extract segments (per-source ffprobe, HDR tone-map, sync tails, cache) -> gap clips for non-contiguous SRT cues -> lossless concat -> final pass with optional global voice mix + subtitle burn LAST (Hard Rule 1). Key correctness properties: - All intermediates land in a safe-ASCII temp work_dir; CJK / quoted user paths never reach libavfilter or the concat demuxer. - SRT input decoded with utf-8-sig / utf-8 / gb18030 / cp936 / cp1252 fallback; cue settings (position:90% etc.) tolerated. - Per-segment cache keyed by ffmpeg version + encoding params + effective bg_volume so encoder tweaks invalidate stale clips. - Source streams probed once; no-audio source auto-degrades bg_volume to 0 for its segments; out-of-bounds ranges fail fast. - Global --voice spans the whole timeline (apad/atrim to total_duration in the final compose), not per-segment — a 5s VO does not restart at every cut. - 30ms audio fades + fps=24,setpts and aresample sync tails on every segment prevent A/V drift through many short concats. - burn_subtitles is self-defending: unsafe subs paths are copied to a temp ASCII SRT before being fed to libavfilter. - Batch (jobs.json / .csv) auto-isolates outputs by manifest index; --continue-on-error skips failing rows; --no-overwrite refuses to clobber existing outputs. 
Includes examples (Form A array, Form B object with multi-source + voices, batch manifest, CJK SRT) and pytest coverage (14 e2e + batch tests using lavfi-synthesized media; passes against ffmpeg 8.x on Windows). Co-Authored-By: Claude Opus 4.7 --- examples/srt_driven/_smoke_test.py | 421 +++++++ examples/srt_driven/edit_plan.json | 17 + examples/srt_driven/edit_plan_v2.json | 32 + examples/srt_driven/jobs.json | 25 + examples/srt_driven/script.srt | 11 + examples/srt_driven/script_cjk.srt | 11 + helpers/srt_driven_edit.py | 1522 +++++++++++++++++++++++++ pyproject.toml | 1 + tests/conftest.py | 153 +++ tests/test_srt_driven_batch.py | 207 ++++ tests/test_srt_driven_e2e.py | 358 ++++++ 11 files changed, 2758 insertions(+) create mode 100644 examples/srt_driven/_smoke_test.py create mode 100644 examples/srt_driven/edit_plan.json create mode 100644 examples/srt_driven/edit_plan_v2.json create mode 100644 examples/srt_driven/jobs.json create mode 100644 examples/srt_driven/script.srt create mode 100644 examples/srt_driven/script_cjk.srt create mode 100644 helpers/srt_driven_edit.py create mode 100644 tests/conftest.py create mode 100644 tests/test_srt_driven_batch.py create mode 100644 tests/test_srt_driven_e2e.py diff --git a/examples/srt_driven/_smoke_test.py b/examples/srt_driven/_smoke_test.py new file mode 100644 index 0000000..88053e2 --- /dev/null +++ b/examples/srt_driven/_smoke_test.py @@ -0,0 +1,421 @@ +"""Regression tests for srt_driven_edit. 
Run with bare `python` — no pytest.""" + +import sys +import tempfile +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "helpers")) + +from srt_driven_edit import ( + parse_srt, parse_plan, align, validate_srt, validate_plan, + validate_alignment, resolve_style, has_cjk, STYLE_TEMPLATES, + subs_filter_escape, safe_ascii_name, + concat_quote_path, read_srt_text, make_safe_work_dir, + _split_time_line, V_SYNC_TAIL, A_SYNC_TAIL, SRT_ENCODINGS, + ensure_safe_subs_path, _path_is_filter_safe, + PARAMS_FINGERPRINT, CACHE_VERSION, cache_key, + Segment, +) + +base = Path(__file__).resolve().parent + + +def section(title: str) -> None: + print(f"\n=== {title} ===") + + +def ok(msg: str) -> None: + print(f" ok: {msg}") + + +def fail(msg: str) -> None: + raise SystemExit(f" FAIL: {msg}") + + +# -- 1. Legacy Form A + Form B parsing ----------------------------------- + +section("Form A (legacy array, English)") +cues = parse_srt(base / "script.srt") +validate_srt(cues) +sources, voices, entries = parse_plan(base / "edit_plan.json") +assert len(cues) == 3 and len(entries) == 3 and sources == {} and voices == {} +ok("parsed 3 cues + 3 plan entries, no sources/voices map") +assert not has_cjk(cues) +ok("has_cjk False on English") + +section("Form B (object, multi-source, multi-voice)") +sources, voices, entries = parse_plan(base / "edit_plan_v2.json") +assert list(sources) == ["A", "B"] and list(voices) == ["host", "guest"] +ok(f"sources={list(sources)} voices={list(voices)}") +assert entries[0].source_name == "A" and entries[0].voice_name == "host" +assert entries[1].source_name == "B" and entries[1].voice_name == "guest" +assert entries[2].source_name == "A" and entries[2].voice_name is None +ok("per-segment source/voice refs parsed") + + +# -- 2. 
CJK detection + auto style + style templates --------------------- + +section("CJK detection + style resolution") +cues_cjk = parse_srt(base / "script_cjk.srt") +assert has_cjk(cues_cjk) is True +assert not has_cjk(cues) +ok("CJK regex matches CN/EN correctly") +auto_cjk = resolve_style("auto", cues_cjk) +auto_en = resolve_style("auto", cues) +assert "Microsoft YaHei UI" in auto_cjk +assert "Helvetica" in auto_en +ok("auto style picks YaHei for CJK, Helvetica for EN") +assert STYLE_TEMPLATES["cjk-natural"] == resolve_style("cjk-natural", cues) +ok("named template lookup") +raw = "FontName=Custom,FontSize=24" +assert resolve_style(raw, cues) == raw +ok("raw ASS string passthrough") + + +# -- 3. SRT encoding fallback (GBK / utf-8-sig / utf-8) ------------------ + +section("read_srt_text encoding fallback") +tmp = Path(tempfile.mkdtemp(prefix="srt_smoke_")) + +cjk_payload = "1\n00:00:00,000 --> 00:00:03,000\n中文字幕测试\n" + +# utf-8 +(tmp / "u8.srt").write_bytes(cjk_payload.encode("utf-8")) +text = read_srt_text(tmp / "u8.srt") +assert "中文字幕测试" in text, f"utf-8 decode wrong: {text!r}" +ok("utf-8 decoded") + +# utf-8 with BOM +(tmp / "u8bom.srt").write_bytes(b"\xef\xbb\xbf" + cjk_payload.encode("utf-8")) +text = read_srt_text(tmp / "u8bom.srt") +assert text.startswith("1") and "中文" in text, f"utf-8-sig decode wrong: {text!r}" +ok("utf-8-sig BOM stripped + decoded") + +# gb18030 (typical Windows Chinese) +(tmp / "gb.srt").write_bytes(cjk_payload.encode("gb18030")) +text = read_srt_text(tmp / "gb.srt") +assert "中文字幕测试" in text, f"gb18030 decode wrong: {text!r}" +ok("gb18030 decoded via fallback") + +# cp936 (a.k.a. 
GBK, Windows Chinese ANSI) +(tmp / "cp936.srt").write_bytes(cjk_payload.encode("cp936")) +text = read_srt_text(tmp / "cp936.srt") +assert "中文字幕测试" in text +ok("cp936 decoded via fallback") + +# Now parse a GBK-encoded full SRT end-to-end +gbk_full = ( + "1\n00:00:00,000 --> 00:00:03,000\n这是第一条\n\n" + "2\n00:00:03,000 --> 00:00:06,000\n这是第二条\n" +) +gbk_path = tmp / "full_gbk.srt" +gbk_path.write_bytes(gbk_full.encode("gb18030")) +cues_gbk = parse_srt(gbk_path) +assert len(cues_gbk) == 2 +assert cues_gbk[0].text == "这是第一条" +assert cues_gbk[1].text == "这是第二条" +ok("parse_srt end-to-end on GB18030 input") + + +# -- 4. SRT cue settings tolerance --------------------------------------- + +section("Cue settings on time line") +# Real-world examples: 'position:90% align:start' on the right of --> +samples = [ + ("00:00:00,000 --> 00:00:03,000 position:90%", (0.0, 3.0)), + ("00:00:01,500 --> 00:00:04,200 align:start line:80%", (1.5, 4.2)), + (" 00:00:02,000 --> 00:00:05,000 X1:10 X2:200 Y1:5 Y2:50", (2.0, 5.0)), + ("00:00:00.500 --> 00:00:01.000", (0.5, 1.0)), # dot fraction +] +for line, expected in samples: + a, b = _split_time_line(line) + from srt_driven_edit import parse_timestamp + got = (parse_timestamp(a), parse_timestamp(b)) + assert abs(got[0] - expected[0]) < 1e-6 and abs(got[1] - expected[1]) < 1e-6, \ + f"{line!r} → {got}, expected {expected}" +ok(f"parsed {len(samples)} time lines with cue settings / odd spacing") + +# Full SRT with cue settings inline +weird_srt = ( + "1\n00:00:00,000 --> 00:00:03,000 position:90% align:start\nhello\n\n" + "2\n00:00:03,000 --> 00:00:07,000 line:80%\nworld\n" +) +weird = tmp / "weird.srt" +weird.write_text(weird_srt, encoding="utf-8") +parsed = parse_srt(weird) +assert len(parsed) == 2 +assert parsed[0].final_start == 0.0 and parsed[0].final_end == 3.0 +assert parsed[0].text == "hello" and parsed[1].text == "world" +ok("parse_srt tolerates cue settings end-to-end") + + +# -- 5. 
concat_quote_path edge cases ------------------------------------- + +section("concat_quote_path edge cases") +cases = [ + (Path("/tmp/foo.mp4"), "'/tmp/foo.mp4'"), + (Path("/tmp/foo bar.mp4"), "'/tmp/foo bar.mp4'"), + (Path("/tmp/it's.mp4"), "'/tmp/it'\\''s.mp4'"), + (Path("/tmp/he said 'hi'.mp4"), "'/tmp/he said '\\''hi'\\''.mp4'"), +] +for p, _expected in cases: + got = concat_quote_path(p) + # We only check the structural pattern: start/end with single quote, + # any embedded single-quotes are properly close-escape-reopened. + assert got.startswith("'") and got.endswith("'"), f"{p}: {got}" + # Verify reverse — closing+escape+reopen idiom for any input apostrophe + if "'" in p.as_posix(): + assert "'\\''" in got, f"{p}: {got}" + ok(f"{p.as_posix()!r:<35} → {got}") + +# CJK paths — verify it doesn't barf and produces a quoted UTF-8 string +# Note: concat_quote_path calls .resolve() which prepends a drive letter on +# Windows, so compare against the resolved posix form, not the literal input. +cjk_p = Path("/tmp/视频 v2/片段.mp4") +got = concat_quote_path(cjk_p) +assert got == f"'{cjk_p.resolve().as_posix()}'" +assert "视频" in got and "片段" in got +ok(f"CJK + space preserved: {got}") + + +# -- 6. make_safe_work_dir produces ASCII path --------------------------- + +section("make_safe_work_dir") +plan_with_cjk_path = tmp / "中文 plan.json" +plan_with_cjk_path.write_text("[]", encoding="utf-8") +wd = make_safe_work_dir("我的剪辑 v2!", plan_with_cjk_path) +assert wd.exists() and wd.is_dir() +# Path must be ASCII-only (no CJK leaks) +assert all(ord(c) < 128 for c in str(wd)), f"work dir not ASCII: {wd}" +assert "srt_edit_" in wd.name +ok(f"work dir is ASCII: {wd}") + +# Re-creating wipes previous contents (deterministic) +sentinel = wd / "_stale.txt" +sentinel.write_text("old") +wd2 = make_safe_work_dir("我的剪辑 v2!", plan_with_cjk_path) +assert wd2 == wd +assert not sentinel.exists() +ok("rerun wipes stale contents") + + +# -- 7. 
Sync tails defined and reasonable -------------------------------- + +section("Sync tail constants") +assert "fps=24" in V_SYNC_TAIL and "setpts=PTS-STARTPTS" in V_SYNC_TAIL +assert "aresample=async=1" in A_SYNC_TAIL and "asetpts=PTS-STARTPTS" in A_SYNC_TAIL +ok(f"V_SYNC_TAIL = {V_SYNC_TAIL}") +ok(f"A_SYNC_TAIL = {A_SYNC_TAIL}") + + +# -- 8. Strict validation ------------------------------------------------- + +section("Validation errors hard-fail") +import json as _j + +# duplicate id in SRT +bad = tmp / "dup.srt" +bad.write_text("1\n00:00:00,000 --> 00:00:01,000\na\n\n1\n00:00:01,000 --> 00:00:02,000\nb\n", encoding="utf-8") +try: + validate_srt(parse_srt(bad)) + fail("dup id should have errored") +except SystemExit as e: + ok(f"dup id: {e}") + +# overlap +bad.write_text("1\n00:00:00,000 --> 00:00:03,000\na\n\n2\n00:00:02,000 --> 00:00:04,000\nb\n", encoding="utf-8") +try: + validate_srt(parse_srt(bad)) + fail("overlap should have errored") +except SystemExit as e: + ok(f"overlap: {e}") + +# non-monotonic +bad.write_text("1\n00:00:05,000 --> 00:00:07,000\na\n\n2\n00:00:00,000 --> 00:00:02,000\nb\n", encoding="utf-8") +try: + validate_srt(parse_srt(bad)) + fail("non-monotonic should have errored") +except SystemExit as e: + ok(f"non-monotonic: {e}") + +# end <= start in plan +bad_plan = tmp / "bad_plan.json" +bad_plan.write_text(_j.dumps([{"id": 1, "source_start": "00:00:05,000", "source_end": "00:00:03,000"}]), encoding="utf-8") +try: + s, v, ents = parse_plan(bad_plan) + validate_plan(ents, s, v, Path("/fake/source.mp4")) + fail("end<=start should have errored") +except SystemExit as e: + ok(f"end<=start: {e}") + +# negative source_start +bad_plan.write_text(_j.dumps([{"id": 1, "source_start": "00:00:00,000", "source_end": "00:00:03,000"}]), encoding="utf-8") +s, v, ents = parse_plan(bad_plan) +ents[0].source_start = -1.0 +try: + validate_plan(ents, s, v, Path("/fake/source.mp4")) + fail("negative start should have errored") +except SystemExit as e: + 
ok(f"negative start: {e}") + +# id mismatch +ok_srt = parse_srt(base / "script.srt") +s, v, ents = parse_plan(base / "edit_plan.json") +from srt_driven_edit import PlanEntry +ents.append(PlanEntry(id=99, source_name="_default", source_start=0.0, source_end=1.0, voice_name=None)) +try: + validate_alignment(ok_srt, ents) + fail("id mismatch should have errored") +except SystemExit as e: + ok(f"id mismatch: {e}") + + +# -- 9. Alignment + gap handling on real example ------------------------- + +section("alignment on script.srt + edit_plan.json") +s, v, ents = parse_plan(base / "edit_plan.json") +segs = align(parse_srt(base / "script.srt"), ents, s, v, + legacy_default_source=Path("/fake/source.mp4"), + tolerance=0.5, trim_direction="tail", on_short="error") +for sg in segs: + print(f" id={sg.id} src[{sg.source_start:.3f}-{sg.source_end:.3f}] " + f"out[{sg.out_start:.3f}-{sg.out_end:.3f}] gap={sg.leading_gap:.3f}") +assert abs(segs[-1].out_end - 12.0) < 1e-6 +assert abs(segs[2].leading_gap - 1.5) < 1e-6 +ok("12.0s total, 1.5s gap before id=3") + + + +# -- 10. 
ensure_safe_subs_path self-defense ------------------------------ + +section("ensure_safe_subs_path") +# safe path (already ASCII, no single quote): returned as-is +safe_in = tmp / "plain.srt" +safe_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8") +out, cleanup = ensure_safe_subs_path(safe_in) +assert out == safe_in and cleanup is None +ok(f"ascii input returned as-is: {out.name}") + +# unsafe path: CJK in name → copied to safe location +cjk_in = tmp / "中文 字幕.srt" +cjk_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8") +out, cleanup = ensure_safe_subs_path(cjk_in) +assert out != cjk_in and cleanup == out +assert str(out).isascii(), f"safe copy still has non-ASCII chars: {out}" +assert out.read_text(encoding="utf-8").startswith("1") +ok(f"CJK input copied to safe path: {out}") +cleanup.unlink() + +# unsafe path: single quote in name → also copied +quote_in = tmp / "it's mine.srt" +quote_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8") +out, cleanup = ensure_safe_subs_path(quote_in) +assert out != quote_in and "'" not in str(out) +ok(f"single-quote input copied to safe path: {out}") +cleanup.unlink() + +# unsafe + non-UTF-8 input gets normalized through read_srt_text +gbk_in = tmp / "gbk 字幕.srt" +gbk_in.write_bytes("1\n00:00:00,000 --> 00:00:01,000\n中文\n".encode("gb18030")) +out, cleanup = ensure_safe_subs_path(gbk_in) +assert "中文" in out.read_text(encoding="utf-8") +ok(f"GB18030 + CJK path → normalized utf-8 safe copy") +cleanup.unlink() + +# _path_is_filter_safe sanity +assert _path_is_filter_safe(Path("/tmp/foo.srt")) is True +assert _path_is_filter_safe(Path("/tmp/视频.srt")) is False +assert _path_is_filter_safe(Path("/tmp/it's.srt")) is False +ok("_path_is_filter_safe correctly flags non-ASCII and single quote") + + +# -- 11. 
Cache key fingerprinting --------------------------------------- + +section("cache_key includes params fingerprint + ffmpeg version") +assert isinstance(PARAMS_FINGERPRINT, str) and len(PARAMS_FINGERPRINT) == 10 +ok(f"PARAMS_FINGERPRINT = {PARAMS_FINGERPRINT}") +assert CACHE_VERSION == 2 +ok(f"CACHE_VERSION bumped to {CACHE_VERSION}") + +# Build a fake segment pointed at a real file (this script) so _file_fingerprint works +fake_seg = Segment( + id=1, + source_path=Path(__file__).resolve(), + source_start=0.0, + source_end=1.0, + out_start=0.0, + out_end=1.0, + leading_gap=0.0, + text="x", + voice_path=None, + pad_short=False, + plan_src_dur=1.0, +) +k_v60 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False, + voice_signature=None, ffmpeg_version="6.0") +k_v71 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False, + voice_signature=None, ffmpeg_version="7.1") +assert k_v60 != k_v71, "different ffmpeg versions should produce different cache keys" +ok(f"ffmpeg 6.0 → {k_v60[:16]}…, 7.1 → {k_v71[:16]}… (differ)") + +k_bg0 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False, + voice_signature=None, ffmpeg_version="6.0") +k_bg1 = cache_key(fake_seg, effective_bg_volume=0.1, hdr=False, portrait=False, + voice_signature=None, ffmpeg_version="6.0") +assert k_bg0 != k_bg1, "different effective bg_volume must invalidate cache" +ok("effective bg_volume differs → cache key differs") + + +# -- 12. 
preflight + probe_streams (best-effort, ffmpeg may be absent) --- + +section("preflight + probe_streams (only if ffmpeg installed)") +import shutil as _sh +import subprocess as _sp +if _sh.which("ffmpeg") and _sh.which("ffprobe"): + from srt_driven_edit import preflight, probe_streams + versions = preflight() + assert "ffmpeg" in versions and "ffprobe" in versions + ok(f"preflight ok: {versions}") + + # Build a 0.5s test mp4 with video + audio via lavfi + av_mp4 = tmp / "probe_av.mp4" + _sp.run([ + "ffmpeg", "-y", "-hide_banner", "-loglevel", "error", + "-f", "lavfi", "-i", "color=c=red:s=320x240:r=24:d=0.5", + "-f", "lavfi", "-i", "anullsrc=channel_layout=stereo:sample_rate=48000", + "-t", "0.5", + "-c:v", "libx264", "-pix_fmt", "yuv420p", + "-c:a", "aac", + str(av_mp4), + ], check=True) + info = probe_streams(av_mp4) + assert info["has_video"] is True and info["has_audio"] is True + assert abs(info["duration"] - 0.5) < 0.1 + ok(f"probe video+audio mp4: {info}") + + # Video-only mp4 → has_audio False, exercises the auto-degrade path + v_only = tmp / "probe_vonly.mp4" + _sp.run([ + "ffmpeg", "-y", "-hide_banner", "-loglevel", "error", + "-f", "lavfi", "-i", "color=c=red:s=320x240:r=24:d=0.5", + "-an", "-t", "0.5", + "-c:v", "libx264", "-pix_fmt", "yuv420p", + str(v_only), + ], check=True) + info = probe_streams(v_only) + assert info["has_video"] is True and info["has_audio"] is False + ok(f"probe video-only mp4: {info}") + + # Garbage input → SystemExit, not a silent pass + junk = tmp / "junk.mp4" + junk.write_bytes(b"not a media file") + try: + probe_streams(junk) + fail("probe_streams on garbage should have raised") + except SystemExit as e: + ok(f"probe_streams hard-fails on junk: {str(e)[:80]}") +else: + ok("ffmpeg not on PATH — preflight/probe_streams tests skipped") + + +print("\n=== ALL TESTS PASSED ===") diff --git a/examples/srt_driven/edit_plan.json b/examples/srt_driven/edit_plan.json new file mode 100644 index 0000000..0e15d43 --- /dev/null +++ 
b/examples/srt_driven/edit_plan.json @@ -0,0 +1,17 @@ +[ + { + "id": 1, + "source_start": "00:12:30,000", + "source_end": "00:12:33,000" + }, + { + "id": 2, + "source_start": "00:18:05,000", + "source_end": "00:18:09,000" + }, + { + "id": 3, + "source_start": "00:22:14,500", + "source_end": "00:22:18,000" + } +] diff --git a/examples/srt_driven/edit_plan_v2.json b/examples/srt_driven/edit_plan_v2.json new file mode 100644 index 0000000..fd7530b --- /dev/null +++ b/examples/srt_driven/edit_plan_v2.json @@ -0,0 +1,32 @@ +{ + "sources": { + "A": "raw/take_a.mp4", + "B": "raw/take_b.mp4" + }, + "voices": { + "host": "voice/host.wav", + "guest": "voice/guest.wav" + }, + "segments": [ + { + "id": 1, + "source": "A", + "source_start": "00:12:30,000", + "source_end": "00:12:33,000", + "voice": "host" + }, + { + "id": 2, + "source": "B", + "source_start": "00:18:05,000", + "source_end": "00:18:09,000", + "voice": "guest" + }, + { + "id": 3, + "source": "A", + "source_start": "00:22:14,500", + "source_end": "00:22:18,000" + } + ] +} diff --git a/examples/srt_driven/jobs.json b/examples/srt_driven/jobs.json new file mode 100644 index 0000000..88f1aef --- /dev/null +++ b/examples/srt_driven/jobs.json @@ -0,0 +1,25 @@ +[ + { + "name": "promo_en", + "source": "raw/take_a.mp4", + "srt": "script.srt", + "plan": "edit_plan.json", + "bg_volume": 0.0, + "style": "bold-uppercase" + }, + { + "name": "promo_cn", + "source": "raw/take_a.mp4", + "srt": "script_cjk.srt", + "plan": "edit_plan.json", + "bg_volume": 0.1, + "style": "cjk-natural" + }, + { + "name": "promo_multi", + "srt": "script.srt", + "plan": "edit_plan_v2.json", + "bg_volume": 0.0, + "style": "auto" + } +] diff --git a/examples/srt_driven/script.srt b/examples/srt_driven/script.srt new file mode 100644 index 0000000..dde4617 --- /dev/null +++ b/examples/srt_driven/script.srt @@ -0,0 +1,11 @@ +1 +00:00:00,000 --> 00:00:03,000 +Ninety percent of what an agent does is wasted. 
+ +2 +00:00:03,000 --> 00:00:07,000 +We rewrote the planner from scratch this quarter. + +3 +00:00:08,500 --> 00:00:12,000 +Here is what changed and what it cost us. diff --git a/examples/srt_driven/script_cjk.srt b/examples/srt_driven/script_cjk.srt new file mode 100644 index 0000000..04dc35f --- /dev/null +++ b/examples/srt_driven/script_cjk.srt @@ -0,0 +1,11 @@ +1 +00:00:00,000 --> 00:00:03,000 +百分之九十的 agent 工作都被浪费了。 + +2 +00:00:03,000 --> 00:00:07,000 +我们这季度把 planner 重写了一遍。 + +3 +00:00:08,500 --> 00:00:12,000 +这里讲一下改了什么、代价是什么。 diff --git a/helpers/srt_driven_edit.py b/helpers/srt_driven_edit.py new file mode 100644 index 0000000..3cfa052 --- /dev/null +++ b/helpers/srt_driven_edit.py @@ -0,0 +1,1522 @@ +"""SRT-driven edit: assemble a final cut by aligning source ranges to an SRT. + +Independent pipeline. Does NOT touch the main render.py flow. Use when you +have a finished script (script.srt = final captions timeline) and a list of +source ranges keyed by SRT id. + +Pipeline: + parse SRT + plan ─> strict validate ─> align ─> resolve style + ─> extract segments (with cache) ─> insert gap clips ─> concat + ─> audio replace/mix + subtitle burn LAST (Hard Rule 1) ─> QC report + +Schemas (both forms accepted): + + Form A — array, single source (legacy): + [{"id": 1, "source_start": "HH:MM:SS,ms", "source_end": "HH:MM:SS,ms"}, ...] 
+      + CLI --source
+
+  Form B — object, multi-source / multi-voice:
+    {
+      "sources": {"A": "path/a.mp4", "B": "path/b.mp4"},
+      "voices": {"main": "path/v.wav"},
+      "segments": [
+        {"id": 1, "source": "A", "source_start": "...", "source_end": "...",
+         "voice": "main"},
+        {"id": 2, "source": "B", "source_start": "...", "source_end": "..."}
+      ]
+    }
+
+Batch:
+  --batch jobs.json (array of per-job dicts, same fields as CLI flags)
+  --batch jobs.csv (header row of the same fields)
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import hashlib
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Any
+
+try:
+    from render import (
+        SUB_FORCE_STYLE as _RENDER_SUB_STYLE,
+        TONEMAP_CHAIN,
+        is_hdr_source,
+        is_portrait_source,
+    )
+except Exception:
+    _RENDER_SUB_STYLE = (
+        "FontName=Helvetica,FontSize=18,Bold=1,"
+        "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000,"
+        "BorderStyle=1,Outline=2,Shadow=0,"
+        "Alignment=2,MarginV=90"
+    )
+    TONEMAP_CHAIN = ""
+
+    def is_hdr_source(video: Path) -> bool:  # type: ignore
+        return False
+
+    def is_portrait_source(video: Path) -> bool:  # type: ignore
+        return False
+
+
+# ============================================================================
+# Constants
+# ============================================================================
+
+FPS = 24
+SAMPLE_RATE = 48000
+AUDIO_BITRATE = "192k"
+DURATION_DRIFT_TOLERANCE_S = 0.2
+
+STYLE_TEMPLATES: dict[str, str] = {
+    "bold-uppercase": _RENDER_SUB_STYLE,
+    "cjk-natural": (
+        "FontName=Microsoft YaHei UI,FontSize=20,Bold=0,"
+        "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000,"
+        "BorderStyle=1,Outline=2,Shadow=0,"
+        "Alignment=2,MarginV=90"
+    ),
+    "narrative": (
+        "FontName=Helvetica,FontSize=20,Bold=0,"
+        "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000,"
+        "BorderStyle=1,Outline=2,Shadow=0,"
+        "Alignment=2,MarginV=80"
+    ),
+}
+
+CJK_RE = re.compile(
+    r"[一-鿿㐀-䶿぀-ゟ゠-ヿ가-힯]"
+)
+
+CACHE_VERSION = 2  # bumped: cache now keyed by ffmpeg version + encoding params
+
+# Encoding-affecting constants captured into a single fingerprint so that
+# any later tweak to codec / preset / sync tails forces a cache miss. If you
+# change PARAMS_FINGERPRINT's inputs, existing cached clips are auto-invalidated.
+def _params_fingerprint() -> str:
+    """Return a 10-hex-char SHA-1 digest of the encoding-affecting settings.
+
+    The payload reads FPS / SAMPLE_RATE / AUDIO_BITRATE and both sync tails
+    from the module constants instead of repeating their literal values, so
+    editing any of them actually changes the fingerprint. V_SYNC_TAIL and
+    A_SYNC_TAIL are defined further down the module; they are resolved when
+    this function is called, which happens after they are assigned.
+    """
+    payload = repr([
+        "fps", FPS,
+        "sr", SAMPLE_RATE,
+        "ab", AUDIO_BITRATE,
+        "ac", 2,
+        "v_codec", "libx264", "preset", "fast", "crf", 20, "pix", "yuv420p",
+        "a_codec", "aac",
+        "v_tail", V_SYNC_TAIL,
+        "a_tail", A_SYNC_TAIL,
+    ])
+    return hashlib.sha1(payload.encode("utf-8")).hexdigest()[:10]
+
+
+# Encodings tried in order when reading user-supplied SRT files. Windows
+# Chinese systems frequently save as GBK/GB18030; macOS / *nix typically
+# UTF-8 (with or without BOM). cp1252 is the last-resort Western Latin1.
+SRT_ENCODINGS = ("utf-8-sig", "utf-8", "gb18030", "cp936", "cp1252")
+
+# Audio/video sync tails appended to every per-segment filter chain so that
+# each extracted clip starts at PTS 0 with monotonic timestamps. Without
+# these, concatenating many short clips accumulates sub-frame drift that
+# eventually desyncs voice from picture.
+V_SYNC_TAIL = f"fps={FPS},setpts=PTS-STARTPTS"
+A_SYNC_TAIL = "aresample=async=1:first_pts=0,asetpts=PTS-STARTPTS"
+
+PARAMS_FINGERPRINT = _params_fingerprint()
+
+
+# ============================================================================
+# Path / filter escaping
+# ============================================================================
+
+
+def subs_filter_escape(path: Path) -> str:
+    """Escape a path for use inside ffmpeg's subtitles='...' filter argument.
+
+    Order matters: backslashes first (Windows), then drive-letter colons, then
+    quotes. The path is returned in forward-slash form for libavfilter sanity.
+ """ + s = path.resolve().as_posix() + s = s.replace("\\", "\\\\") + s = s.replace(":", r"\:") + s = s.replace("'", r"\'") + return s + + +def safe_ascii_name(stem: str) -> str: + """Reduce a filename stem to a safe ASCII slug for intermediate files.""" + s = re.sub(r"[^A-Za-z0-9._-]+", "_", stem) + s = s.strip("_") or "job" + return s[:48] + + +def concat_quote_path(p: Path) -> str: + """Quote a path for ffmpeg's concat demuxer 'file' directive. + + Embeds single quotes via the close-escape-reopen idiom: `'` -> `'\\''`. + Paths are normalized to posix form so backslashes do not become escape + sequences when libavformat parses the list. + """ + s = p.resolve().as_posix() + escaped = s.replace("'", "'\\''") + return f"'{escaped}'" + + +def read_srt_text(path: Path) -> str: + """Read an SRT with encoding fallback. + + Tries SRT_ENCODINGS in order; returns the first successful decode. + Raises SystemExit with a helpful message if none work. + """ + raw = path.read_bytes() + last_err: Exception | None = None + for enc in SRT_ENCODINGS: + try: + return raw.decode(enc) + except UnicodeDecodeError as e: + last_err = e + continue + raise SystemExit( + f"could not decode SRT {path} with any of {SRT_ENCODINGS}: {last_err}" + ) + + +def make_safe_work_dir(job_name: str, plan_path: Path) -> Path: + """Create (or reset) a safe ASCII-named temp dir for one job's intermediates. + + Lives under tempfile.gettempdir() so it never inherits CJK / quote / + space characters from the user's project path. Deterministic hash means + re-runs land in the same dir for debuggability. 
+    """
+    h = hashlib.sha1(
+        f"{plan_path.resolve().as_posix()}|{job_name}".encode("utf-8")
+    ).hexdigest()[:12]
+    p = Path(tempfile.gettempdir()) / f"srt_edit_{h}"
+    if p.exists():
+        shutil.rmtree(p, ignore_errors=True)
+    # The wipe above is best-effort: with ignore_errors=True the directory
+    # can survive (e.g. a file inside it is still locked). exist_ok=True keeps
+    # that case from surfacing as an unrelated FileExistsError.
+    p.mkdir(parents=True, exist_ok=True)
+    return p
+
+
+def _path_is_filter_safe(p: Path) -> bool:
+    """Cheap libavfilter-path safety check: ASCII only and no single quotes."""
+    s = str(p)
+    return s.isascii() and "'" not in s
+
+
+def ensure_safe_subs_path(src: Path) -> tuple[Path, Path | None]:
+    """Return (path_to_feed_to_ffmpeg, cleanup_target_or_None).
+
+    If src is already filter-safe, return it as-is and no cleanup target.
+    Otherwise copy to a deterministic ASCII path under the system temp dir
+    and return that, plus a handle the caller should unlink in finally.
+
+    Decoded through read_srt_text so GB18030 / cp936 inputs become UTF-8.
+    """
+    if _path_is_filter_safe(src):
+        return src, None
+    h = hashlib.sha1(src.resolve().as_posix().encode("utf-8")).hexdigest()[:12]
+    safe = Path(tempfile.gettempdir()) / f"srt_burn_{h}.srt"
+    safe.write_text(read_srt_text(src), encoding="utf-8")
+    return safe, safe
+
+
+# ============================================================================
+# Preflight: tool availability + media stream probing
+# ============================================================================
+
+
+_FFMPEG_VERSION_RE = re.compile(r"^ffmpeg version (\S+)")
+_FFPROBE_VERSION_RE = re.compile(r"^ffprobe version (\S+)")
+
+
+def preflight() -> dict[str, str]:
+    """Verify ffmpeg + ffprobe are on PATH and runnable. Return version dict.
+
+    Used both for early failure and to fingerprint cache keys: encoding
+    behavior can shift between ffmpeg versions, so a version bump should
+    invalidate cached clips.
+ """ + info: dict[str, str] = {} + for tool, rx in (("ffmpeg", _FFMPEG_VERSION_RE), ("ffprobe", _FFPROBE_VERSION_RE)): + try: + r = subprocess.run( + [tool, "-version"], + capture_output=True, text=True, timeout=10, + encoding="utf-8", errors="replace", + ) + except FileNotFoundError: + raise SystemExit( + f"required tool not on PATH: {tool}. Install ffmpeg first " + f"(e.g. `winget install Gyan.FFmpeg` on Windows, " + f"`brew install ffmpeg` on macOS)." + ) + except subprocess.TimeoutExpired: + raise SystemExit(f"{tool} timed out on `-version`. Bad install?") + if r.returncode != 0: + raise SystemExit( + f"{tool} `-version` exited {r.returncode}: {(r.stderr or '')[:300]}" + ) + first_line = (r.stdout.splitlines() or [""])[0].strip() + m = rx.match(first_line) + info[tool] = m.group(1) if m else first_line[:40] or "unknown" + return info + + +def probe_streams(path: Path) -> dict: + """Probe a media file for {has_video, has_audio, duration}. + + Raises SystemExit on probe failure so the caller doesn't continue + blindly. Result is cheap to memoize per source path. 
+    """
+    try:
+        r = subprocess.run(
+            [
+                "ffprobe", "-v", "error",
+                "-show_entries", "stream=codec_type",
+                "-show_entries", "format=duration",
+                "-of", "json", str(path),
+            ],
+            capture_output=True, text=True, check=True,
+            encoding="utf-8", errors="replace",
+        )
+    except FileNotFoundError:
+        # ffprobe itself is missing: fail with the same kind of clean exit the
+        # non-zero-exit branch below uses, instead of a raw traceback.
+        raise SystemExit(
+            f"required tool not on PATH: ffprobe (needed to probe {path})"
+        )
+    except subprocess.CalledProcessError as e:
+        raise SystemExit(
+            f"ffprobe failed on {path}: {(e.stderr or '')[:300]}"
+        )
+    try:
+        data = json.loads(r.stdout)
+    except json.JSONDecodeError as e:
+        # Exit code 0 with unparseable output is still a probe failure.
+        raise SystemExit(f"ffprobe returned unparseable JSON for {path}: {e}")
+    types: set[str] = set()
+    for s in data.get("streams", []) or []:
+        t = s.get("codec_type")
+        if t:
+            types.add(t)
+    fmt = data.get("format") or {}
+    try:
+        duration = float(fmt.get("duration", 0.0))
+    except (TypeError, ValueError):
+        # Missing or non-numeric duration is reported as 0.0.
+        duration = 0.0
+    return {
+        "has_video": "video" in types,
+        "has_audio": "audio" in types,
+        "duration": duration,
+    }
+
+
+# ============================================================================
+# Time parsing
+# ============================================================================
+
+
+_TS_RE = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})")
+
+
+def parse_timestamp(ts: str) -> float:
+    """Convert 'H:MM:SS,mmm' (or '.' as the fraction separator) to seconds.
+
+    The fraction may have 1-3 digits and is right-padded with zeros, so
+    '00:00:01,5' is 1.5 s. Raises ValueError if the text does not match.
+    """
+    m = _TS_RE.fullmatch(ts.strip())
+    if not m:
+        raise ValueError(f"bad timestamp: {ts!r}")
+    h, mn, s, ms = m.groups()
+    return int(h) * 3600 + int(mn) * 60 + int(s) + int(ms.ljust(3, "0")) / 1000.0
+
+
+def format_srt_ts(seconds: float) -> str:
+    """Format seconds as an SRT timestamp 'HH:MM:SS,mmm' (rounded to ms)."""
+    total_ms = int(round(seconds * 1000))
+    h, rem = divmod(total_ms, 3600_000)
+    m, rem = divmod(rem, 60_000)
+    s, ms = divmod(rem, 1000)
+    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+
+
+# ============================================================================
+# Dataclasses
+# ============================================================================
+
+
+@dataclass
+class SrtCue:
+    id: int
+    final_start: float
+    final_end: float
+    text: str
+
+    @property
+    def duration(self) -> float:
+        return self.final_end - self.final_start
+
+
+@dataclass
+class PlanEntry:
+    id: int
+    source_name: str  # key into sources map (Form A: synthetic "_default")
+    source_start: float
def _split_time_line(line: str) -> tuple[str, str]:
    """Pull the raw (start_ts, end_ts) strings out of an SRT time line.

    The line is cut at the first '-->'. The start timestamp is the last
    whitespace-delimited token before the arrow and the end timestamp is
    the first token after it, so trailing cue settings such as
    'position:90% align:start' are dropped.

    Raises:
        ValueError: if the arrow is absent or either side has no token.
    """
    before, arrow, after = line.partition("-->")
    if not arrow:
        raise ValueError(f"missing '-->' in time line: {line!r}")
    # str.split() with no argument already ignores surrounding whitespace.
    start_side = before.split()
    end_side = after.split()
    if not (start_side and end_side):
        raise ValueError(f"missing timestamps in time line: {line!r}")
    return start_side[-1], end_side[0]
def validate_srt(cues: list[SrtCue]) -> None:
    """Abort with SystemExit unless the cue list forms a usable timeline.

    Per-cue checks, in file order: ids are unique, each cue ends after it
    starts, and no cue starts before zero. Then, with cues ordered by id,
    every cue must start no earlier than its predecessor starts and may not
    begin before its predecessor ends (1 microsecond of slack is allowed).
    """
    if not cues:
        raise SystemExit("SRT has no cues")

    known_ids: set[int] = set()
    for cue in cues:
        if cue.id in known_ids:
            raise SystemExit(f"SRT duplicate id: {cue.id}")
        known_ids.add(cue.id)
        if cue.final_start >= cue.final_end:
            raise SystemExit(
                f"SRT id={cue.id}: end {cue.final_end:.3f} <= start {cue.final_start:.3f}"
            )
        if cue.final_start < 0:
            raise SystemExit(f"SRT id={cue.id}: negative start {cue.final_start:.3f}")

    by_id = list(cues)
    by_id.sort(key=lambda cue: cue.id)
    # Walk neighbouring pairs in id order.
    for earlier, later in zip(by_id, by_id[1:]):
        if later.final_start < earlier.final_start:
            raise SystemExit(
                f"SRT non-monotonic by id: id={later.id} starts at "
                f"{later.final_start:.3f}s, earlier than id={earlier.id} at "
                f"{earlier.final_start:.3f}s"
            )
        if later.final_start < earlier.final_end - 1e-6:
            raise SystemExit(
                f"SRT cue overlap: id={earlier.id} ends {earlier.final_end:.3f}, "
                f"id={later.id} starts {later.final_start:.3f}"
            )
def validate_plan(
    entries: list[PlanEntry],
    sources_map: dict[str, Path],
    voices_map: dict[str, Path],
    legacy_default_source: Path | None,
) -> None:
    """Abort with SystemExit if the parsed plan cannot be rendered.

    Each entry is checked in order for: a unique id, a non-negative
    source_start, a source_end after source_start, a resolvable source
    (the "_default" name needs ``legacy_default_source``; any other name
    must be a key of ``sources_map``) and, when a voice is named, a
    matching key in ``voices_map``. Afterwards every mapped source, every
    mapped voice and the legacy source (if given) must exist on disk.
    """
    if not entries:
        raise SystemExit("edit_plan has no segments")

    used_ids: set[int] = set()
    for item in entries:
        if item.id in used_ids:
            raise SystemExit(f"plan duplicate id: {item.id}")
        used_ids.add(item.id)

        if item.source_start < 0:
            raise SystemExit(
                f"plan id={item.id}: negative source_start {item.source_start}"
            )
        if item.source_end <= item.source_start:
            raise SystemExit(
                f"plan id={item.id}: source_end {item.source_end:.3f} <= "
                f"source_start {item.source_start:.3f}"
            )

        uses_cli_source = item.source_name == "_default"
        if uses_cli_source and legacy_default_source is None:
            raise SystemExit(
                "Form A plan requires --source at the CLI"
            )
        if not uses_cli_source and item.source_name not in sources_map:
            raise SystemExit(
                f"plan id={item.id}: source '{item.source_name}' not in sources map"
            )

        if item.voice_name is not None and item.voice_name not in voices_map:
            raise SystemExit(
                f"plan id={item.id}: voice '{item.voice_name}' not in voices map"
            )

    # Sources are checked before voices, each in mapping order.
    for kind, mapping in (("source", sources_map), ("voice", voices_map)):
        for key, location in mapping.items():
            if not location.exists():
                raise SystemExit(f"{kind} '{key}' missing on disk: {location}")

    if legacy_default_source is not None and not legacy_default_source.exists():
        raise SystemExit(f"--source missing on disk: {legacy_default_source}")
def has_cjk(cues: list[SrtCue]) -> bool:
    """Report whether the text of at least one cue matches ``CJK_RE``."""
    for cue in cues:
        if CJK_RE.search(cue.text):
            return True
    return False
def cache_store(cache_dir: Path, key: str, clip_path: Path) -> None:
    """Publish a rendered clip into the cache as ``<cache_dir>/<key>.mp4``.

    The clip is first copied to a staging file next to the final name and
    then renamed over it. A copy that is interrupted or fails therefore
    never leaves a truncated ``<key>.mp4`` behind; such a file would later
    be indistinguishable from a valid entry for any reader that only checks
    whether ``<key>.mp4`` exists.

    Args:
        cache_dir: cache directory; created (with parents) if missing.
        key: cache key used as the file stem.
        clip_path: rendered clip to copy; metadata is preserved (copy2).

    Raises:
        OSError: if the copy or the rename fails. The staging file is
            removed on a best-effort basis before the error propagates.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    final_path = cache_dir / f"{key}.mp4"
    staging_path = cache_dir / f"{key}.mp4.part"
    try:
        shutil.copy2(clip_path, staging_path)
        # Same directory, so the rename replaces the target in one step.
        staging_path.replace(final_path)
    finally:
        # After a successful rename the staging name no longer exists; on
        # failure this removes the partial copy without masking the error.
        try:
            staging_path.unlink(missing_ok=True)
        except OSError:
            pass
"default=noprint_wrappers=1:nokey=1", str(path)], + capture_output=True, text=True, check=True, + ) + return float(out.stdout.strip()) + + +def scale_filter_for(source: Path) -> str: + return "scale=-2:1920" if is_portrait_source(source) else "scale=1920:-2" + + +def _voice_signature(voice_path: Path | None, target: float) -> tuple | None: + if voice_path is None: + return None + fp = _file_fingerprint(voice_path) + return (str(voice_path.resolve()), fp[0], fp[1], round(target, 4)) + + +def extract_segment( + seg: Segment, + out_path: Path, + bg_volume: float, +) -> None: + """Extract one segment to 1080p 24fps with audio resolved per-segment. + + `bg_volume` here is the EFFECTIVE level — callers must already have + zeroed it for sources whose ffprobe says there is no audio track. + + Audio resolution: + voice_path present + bg_volume > 0 → mix voice + source*bg + voice_path present + bg_volume == 0 → voice only + voice_path absent + bg_volume > 0 → source audio at bg_volume (fades) + voice_path absent + bg_volume == 0 → silent + """ + keep_audio_from_source = bg_volume > 0.0 + out_path.parent.mkdir(parents=True, exist_ok=True) + target = seg.duration + + vf_parts: list[str] = [] + if is_hdr_source(seg.source_path): + vf_parts.append(TONEMAP_CHAIN) + vf_parts.append(scale_filter_for(seg.source_path)) + + if seg.pad_short and seg.plan_src_dur + 1e-6 < target: + vf_parts.append( + f"tpad=stop_mode=clone:stop_duration={target - seg.plan_src_dur:.3f}" + ) + v_input_dur = seg.plan_src_dur + else: + v_input_dur = target + + vf_parts.append(V_SYNC_TAIL) + vf = ",".join(vf_parts) + + inputs: list[str] = [ + "-ss", f"{seg.source_start:.3f}", + "-i", str(seg.source_path), + "-t", f"{v_input_dur:.3f}", + ] + + has_voice = seg.voice_path is not None + voice_index: int | None = None + if has_voice: + voice_index = 1 + inputs += ["-i", str(seg.voice_path)] + + # Audio filter graph — applied via -filter_complex when we have voice, + # otherwise simple -af on source audio. 
+ audio_args: list[str] = [] + if has_voice and bg_volume <= 0.0: + fade_out = max(0.0, target - 0.03) + ac_parts = [ + f"[{voice_index}:a]apad=whole_dur={target:.3f}," + f"atrim=duration={target:.3f},asetpts=PTS-STARTPTS," + f"afade=t=in:st=0:d=0.03," + f"afade=t=out:st={fade_out:.3f}:d=0.03," + f"{A_SYNC_TAIL}[outa]" + ] + audio_args = ["-filter_complex", ";".join(ac_parts), + "-map", "[outa]"] + elif has_voice and bg_volume > 0.0: + fade_out = max(0.0, target - 0.03) + ac_parts = [ + f"[{voice_index}:a]apad=whole_dur={target:.3f}," + f"atrim=duration={target:.3f},asetpts=PTS-STARTPTS[voice]", + f"[0:a]volume={bg_volume:.3f}," + f"afade=t=in:st=0:d=0.03,afade=t=out:st={fade_out:.3f}:d=0.03[bg]", + f"[voice][bg]amix=inputs=2:duration=first:normalize=0," + f"{A_SYNC_TAIL}[outa]", + ] + audio_args = ["-filter_complex", ";".join(ac_parts), + "-map", "[outa]"] + elif not has_voice and keep_audio_from_source: + fade_out = max(0.0, target - 0.03) + af = ( + f"volume={bg_volume:.3f}," + f"afade=t=in:st=0:d=0.03,afade=t=out:st={fade_out:.3f}:d=0.03," + f"{A_SYNC_TAIL}" + ) + if seg.pad_short and seg.plan_src_dur + 1e-6 < target: + af = f"apad=whole_dur={target:.3f},{af}" + audio_args = ["-af", af, "-map", "0:a"] + else: + # silent track via lavfi so concat inputs share an audio stream + inputs += [ + "-f", "lavfi", "-t", f"{target:.3f}", + "-i", f"anullsrc=channel_layout=stereo:sample_rate={SAMPLE_RATE}", + ] + silent_idx = 2 if has_voice else 1 + audio_args = ["-af", A_SYNC_TAIL, "-map", f"{silent_idx}:a"] + + cmd: list[str] = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + *inputs, + "-vf", vf, "-r", str(FPS), + "-map", "0:v", + *audio_args, + "-c:v", "libx264", "-preset", "fast", "-crf", "20", + "-pix_fmt", "yuv420p", + "-c:a", "aac", "-b:a", AUDIO_BITRATE, "-ar", str(SAMPLE_RATE), "-ac", "2", + "-t", f"{target:.3f}", + "-movflags", "+faststart", + str(out_path), + ] + run_ff(cmd, f"extract id={seg.id} src[{seg.source_start:.2f}-{seg.source_end:.2f}] → 
{out_path.name}") + + +def make_gap_clip(duration: float, portrait: bool, out_path: Path) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + size = "1080x1920" if portrait else "1920x1080" + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-f", "lavfi", "-i", f"color=c=black:s={size}:r={FPS}:d={duration:.3f}", + "-f", "lavfi", "-i", + f"anullsrc=channel_layout=stereo:sample_rate={SAMPLE_RATE}", + "-t", f"{duration:.3f}", + "-vf", V_SYNC_TAIL, + "-af", A_SYNC_TAIL, + "-c:v", "libx264", "-preset", "fast", "-crf", "20", + "-pix_fmt", "yuv420p", "-r", str(FPS), + "-c:a", "aac", "-b:a", AUDIO_BITRATE, "-ar", str(SAMPLE_RATE), + "-movflags", "+faststart", + str(out_path), + ] + run_ff(cmd, f"gap {duration:.3f}s → {out_path.name}") + + +def concat_clips(clip_paths: list[Path], out_path: Path, work_dir: Path) -> None: + """Concat losslessly via the demuxer. work_dir is assumed safe-ASCII. + + Each line is `file ` with the quoting routine that handles + spaces, single quotes, and CJK. Callers should register the list file + for cleanup BEFORE this is invoked so a mid-write failure still cleans up. + """ + concat_list = work_dir / "_concat_srt_driven.txt" + lines = [f"file {concat_quote_path(p)}\n" for p in clip_paths] + concat_list.write_text("".join(lines), encoding="utf-8") + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-f", "concat", "-safe", "0", + "-i", str(concat_list), + "-c", "copy", + "-movflags", "+faststart", + str(out_path), + ] + run_ff(cmd, f"concat {len(clip_paths)} clips → {out_path.name}") + + +def burn_subtitles( + base_path: Path, + subs_path: Path, + style: str, + fontsdir: Path | None, + out_path: Path, + *, + global_voice: Path | None = None, + total_duration: float = 0.0, +) -> None: + """Final pass: optional global-voice mix + subtitle burn (LAST). + + Self-defending on subs_path: if not filter-safe, copied to a deterministic + temp SRT first so libavfilter never sees the problematic original. 
+ fontsdir, if given, must already be filter-safe — we error rather than + copy an entire font directory. + + Audio handling: + - global_voice is None: pass base audio through (`-c:a copy`). + - global_voice given: voice is apad'd / atrim'd to exactly total_duration + so it spans the entire output timeline, then mixed on top of base's + audio. Base already contains source*bg_volume (or silence) from + extract_segment, so we do NOT re-scale it here — that would double- + attenuate the background. amix uses duration=first so the result + runs exactly total_duration; normalize=0 keeps levels predictable. + """ + if fontsdir is not None and not _path_is_filter_safe(fontsdir): + raise SystemExit( + f"fontsdir contains non-ASCII or single-quote characters; " + f"move it to a safe ASCII path first: {fontsdir}" + ) + + safe_subs, cleanup_target = ensure_safe_subs_path(subs_path) + try: + subs_arg = subs_filter_escape(safe_subs) + style_escaped = style.replace("'", r"\'") + if fontsdir is not None: + fd = subs_filter_escape(fontsdir) + subs_filter = f"subtitles='{subs_arg}':fontsdir='{fd}':force_style='{style_escaped}'" + else: + subs_filter = f"subtitles='{subs_arg}':force_style='{style_escaped}'" + + if global_voice is None: + # No audio work — just burn subtitles, copy audio. + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-i", str(base_path), + "-vf", subs_filter, + "-c:v", "libx264", "-preset", "fast", "-crf", "18", + "-pix_fmt", "yuv420p", + "-c:a", "copy", + "-movflags", "+faststart", + str(out_path), + ] + label = f"subtitle burn (LAST) → {out_path.name}" + else: + if total_duration <= 0.0: + raise SystemExit( + "burn_subtitles: total_duration must be > 0 when global_voice is set" + ) + voice_chain = ( + f"[1:a]apad=whole_dur={total_duration:.3f}," + f"atrim=duration={total_duration:.3f}," + f"asetpts=PTS-STARTPTS," + f"{A_SYNC_TAIL}" + ) + # base [0:a] already contains source*bg_volume from extract; do NOT + # apply bg_volume again here. 
def _dir_size(path: Path) -> int:
    """Return the combined size in bytes of every file found under *path*.

    The tree is walked recursively. A path that does not exist counts as 0.
    """
    if path.exists():
        return sum(
            entry.stat().st_size
            for entry in path.rglob("*")
            if entry.is_file()
        )
    return 0
"applied": True, + "style_name": style_name, + "force_style": style_resolved, + "cue_count": len(segments), + }, + "audio": { + "mode": audio_mode, + "bg_volume": bg_volume, + "voice_used": has_any_voice, + }, + "disk_usage_bytes": { + "work_dir_total": work_dir_size, + "clips_in_work_dir": clips_size, + "final_output": final_size, + "cache": cache_size, + }, + "output_path": str(output_path), + } + out_qc_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" QC report → {out_qc_path.name}") + return report + + +# ============================================================================ +# Single-job runner +# ============================================================================ + + +@dataclass +class Job: + source: Path | None # legacy single-source path; None if Form B + srt: Path + plan: Path + voice: Path | None # global voice override (mutually exclusive with per-segment) + bg_volume: float + tolerance: float + trim_direction: str + on_short: str + style: str + fontsdir: Path | None + output: Path | None + name: str + no_cache: bool + keep_intermediates: bool + no_overwrite: bool = False + + +def run_job(job: Job, ffmpeg_version: str) -> dict: + t0 = time.time() + print(f"\n== job: {job.name} ==") + + cues = parse_srt(job.srt) + validate_srt(cues) + sources_map, voices_map, entries = parse_plan(job.plan) + + legacy_source: Path | None = None + if sources_map: + if job.source is not None: + print(" note: --source ignored (plan defines its own sources)") + else: + if job.source is None: + raise SystemExit("Form A plan needs --source ") + legacy_source = job.source.resolve() + + has_per_seg_voice = any(e.voice_name for e in entries) + if job.voice is not None and has_per_seg_voice: + raise SystemExit( + "voice conflict: --voice given AND plan contains per-segment voices. " + "Pick one." + ) + + # Global voice is NOT expanded into per-segment entries. 
Per-segment voices + # play during their segment's window; a global voice spans the entire + # output timeline and is mixed in during the final compose step. Doing it + # at extract time would replay voice[0:seg_dur] for every segment, which + # is wrong for any voice longer than one segment. + global_voice: Path | None = job.voice + if global_voice is not None: + v_info = probe_streams(global_voice) + if not v_info["has_audio"]: + raise SystemExit(f"global --voice file has no audio track: {global_voice}") + print(f" global voice: {global_voice.name} ({v_info['duration']:.3f}s)") + + validate_plan(entries, sources_map, voices_map, legacy_source) + validate_alignment(cues, entries) + + # Probe every source once. Cache by Path to avoid repeat ffprobe calls + # when many segments share a source. + unique_sources: dict[str, Path] = {} + if legacy_source is not None: + unique_sources["_default"] = legacy_source + for name, p in sources_map.items(): + unique_sources[name] = p + + source_info: dict[str, dict] = {} + source_info_by_path: dict[Path, dict] = {} + print(" probing sources:") + for name, p in unique_sources.items(): + info = probe_streams(p) + source_info[name] = info + source_info_by_path[p] = info + print(f" {name}: video={info['has_video']} audio={info['has_audio']} " + f"duration={info['duration']:.3f}s") + if not info["has_video"]: + raise SystemExit(f"source '{name}' has no video stream: {p}") + + # Range bounds — fail fast rather than letting ffmpeg fail mid-batch. + for e in entries: + info = source_info[e.source_name] + if e.source_end > info["duration"] + job.tolerance: + raise SystemExit( + f"plan id={e.id}: source_end {e.source_end:.3f}s exceeds " + f"source '{e.source_name}' duration {info['duration']:.3f}s " + f"(tolerance ±{job.tolerance}s)" + ) + + # Effective bg_volume per source: if source has no audio track, force to 0 + # rather than letting ffmpeg fail on a missing 0:a stream reference. 
+ no_audio_names = [n for n, info in source_info.items() if not info["has_audio"]] + if no_audio_names and job.bg_volume > 0.0: + print(f" WARNING: source(s) {no_audio_names} have no audio track — " + f"bg_volume forced to 0 for segments from them") + + segments = align( + cues, entries, sources_map, voices_map, legacy_source, + tolerance=job.tolerance, trim_direction=job.trim_direction, + on_short=job.on_short, + ) + + edit_dir = (job.output.parent if job.output else job.plan.parent / "edit") + edit_dir.mkdir(parents=True, exist_ok=True) + out_path = job.output.resolve() if job.output else ( + edit_dir / f"final_srt_driven_{safe_ascii_name(job.name)}.mp4" + ) + + if out_path.exists(): + if job.no_overwrite: + raise SystemExit(f"output exists and --no-overwrite set: {out_path}") + print(f" WARNING: overwriting existing output: {out_path}") + + style_resolved = resolve_style(job.style, cues) + print(f" style: {job.style} ({len(cues)} cues, cjk={has_cjk(cues)})") + + # All intermediates live in a safe-ASCII temp dir under tempfile.gettempdir(). + # Wiped at start so a previous crashed run cannot pollute. Wiped at end + # (in finally) unless --keep-intermediates is set. + work_dir = make_safe_work_dir(job.name, job.plan) + print(f" work dir: {work_dir}") + + try: + # SRT normalized to UTF-8 with encoding fallback (handles GB18030 input). + # Lives in the safe work dir so its path is guaranteed friendly to libass. 
+ safe_subs = work_dir / "subs.srt" + safe_subs.write_text(read_srt_text(job.srt), encoding="utf-8") + + edl_path = edit_dir / f"edl_srt_driven_{safe_ascii_name(job.name)}.json" + write_edl(segments, job.srt, job.plan, job.bg_volume, job.style, edl_path) + + clips_dir = work_dir / "clips" + clips_dir.mkdir(parents=True, exist_ok=True) + cache_dir = edit_dir / "cache_srt_driven" + + portrait = is_portrait_source(segments[0].source_path) + + clip_paths: list[Path] = [] + seg_clip_info: list[dict] = [] + any_voice = any(s.voice_path is not None for s in segments) + + print(f"\n extracting {len(segments)} segments cache={'off' if job.no_cache else 'on'} voice={'per-seg' if any_voice else 'none'}") + for i, seg in enumerate(segments): + if seg.leading_gap > 0.001: + gap_path = clips_dir / f"gap_{i:02d}_{seg.leading_gap:.3f}.mp4" + if not gap_path.exists(): + make_gap_clip(seg.leading_gap, portrait, gap_path) + clip_paths.append(gap_path) + + seg_path = clips_dir / f"seg_{i:02d}_id{seg.id}.mp4" + voice_sig = _voice_signature(seg.voice_path, seg.duration) + + # Effective bg_volume for THIS segment: forced to 0 if its source + # has no audio track. Keeps ffmpeg from referencing a missing 0:a. 
+ src_has_audio = source_info_by_path[seg.source_path]["has_audio"] + effective_bg = job.bg_volume if src_has_audio else 0.0 + + ck = cache_key( + seg, + effective_bg_volume=effective_bg, + hdr=is_hdr_source(seg.source_path), + portrait=portrait, + voice_signature=voice_sig, + ffmpeg_version=ffmpeg_version, + ) if not job.no_cache else None + + cached_hit = False + if ck and (hit := cache_lookup(cache_dir, ck)) is not None: + shutil.copy2(hit, seg_path) + print(f" [cache hit] id={seg.id} → {seg_path.name}") + cached_hit = True + else: + extract_segment(seg, seg_path, bg_volume=effective_bg) + if ck: + cache_store(cache_dir, ck, seg_path) + + clip_paths.append(seg_path) + seg_clip_info.append({"clip_path": str(seg_path), "cached": cached_hit}) + + base_path = work_dir / "base.mp4" + concat_clips(clip_paths, base_path, work_dir) + + total_duration = segments[-1].out_end + burn_subtitles( + base_path, safe_subs, style_resolved, job.fontsdir, out_path, + global_voice=global_voice, + total_duration=total_duration, + ) + + # QC voice flag must reflect EITHER per-segment OR global voice usage. 
def load_manifest(path: Path) -> list[dict]:
    """Load a batch manifest and return its rows as a list.

    The format is chosen by the (case-insensitive) file suffix:

    * ``.json`` — the document must be a top-level array, returned as-is.
    * ``.csv``  — read as UTF-8 with an optional BOM; each row becomes a
      dict with empty-string cells dropped.

    Raises:
        SystemExit: for any other suffix, or when the JSON document is not
            an array.
    """
    ext = path.suffix.lower()
    if ext not in (".json", ".csv"):
        raise SystemExit(f"unsupported manifest format: {ext}")

    if ext == ".csv":
        with path.open(newline="", encoding="utf-8-sig") as handle:
            return [
                {col: val for col, val in record.items() if val != ""}
                for record in csv.DictReader(handle)
            ]

    parsed = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(parsed, list):
        return parsed
    raise SystemExit("batch manifest JSON must be an array of job dicts")
def _bool(key: str, fb: bool) -> bool: + v = d.get(key) + if isinstance(v, bool): + return v + if v in (None, ""): + return fb + return str(v).lower() in ("1", "true", "yes", "on") + + srt_path = _path("srt") + plan_path = _path("plan") + if srt_path is None: + raise SystemExit(f"manifest row {idx}: missing srt") + if plan_path is None: + raise SystemExit(f"manifest row {idx}: missing plan") + + job_name = _str("name", plan_path.stem) + explicit_output = _path("output") + if explicit_output is None: + # Auto-isolate outputs by index so two jobs with the same name never + # silently overwrite each other. + explicit_output = ( + manifest_dir / f"final_srt_driven_{safe_ascii_name(job_name)}_{idx:02d}.mp4" + ) + + return Job( + source=_path("source"), + srt=srt_path, + plan=plan_path, + voice=_path("voice"), + bg_volume=_float("bg_volume", defaults.bg_volume), + tolerance=_float("tolerance", defaults.tolerance), + trim_direction=_str("trim_direction", defaults.trim_direction), + on_short=_str("on_short", defaults.on_short), + style=_str("style", defaults.style), + fontsdir=_path("fontsdir"), + output=explicit_output, + name=job_name, + no_cache=_bool("no_cache", defaults.no_cache), + keep_intermediates=_bool("keep_intermediates", defaults.keep_intermediates), + no_overwrite=_bool("no_overwrite", defaults.no_overwrite), + ) + + +# ============================================================================ +# CLI +# ============================================================================ + + +def main() -> None: + ap = argparse.ArgumentParser(description="SRT-driven edit assembly") + ap.add_argument("--source", type=Path, default=None, + help="Form A: single source.mp4. 
Ignored if plan declares sources.") + ap.add_argument("--srt", type=Path, default=None, help="script.srt") + ap.add_argument("--plan", type=Path, default=None, help="edit_plan.json (Form A or B)") + ap.add_argument("--voice", type=Path, default=None, + help="Global voice.wav spanning the whole timeline. " + "Mutually exclusive with per-segment voices in the plan.") + ap.add_argument("--bg-volume", type=float, default=0.0, + help="original audio level (0.0=mute, 0.1=10%%). Default 0.0.") + ap.add_argument("--tolerance", type=float, default=0.5, + help="seconds. |source_dur - srt_dur| > tolerance triggers trim/error.") + ap.add_argument("--trim-direction", choices=["tail", "head", "center"], default="tail") + ap.add_argument("--on-short", choices=["error", "pad"], default="error") + ap.add_argument("--style", default="auto", + help=f"subtitle style. Templates: {sorted(STYLE_TEMPLATES)}. " + "'auto' picks cjk-natural if SRT has CJK, else bold-uppercase. " + "Pass a raw ASS string containing '=' to override.") + ap.add_argument("--fontsdir", type=Path, default=None, + help="extra fonts directory passed to libass.") + ap.add_argument("-o", "--output", type=Path, default=None) + ap.add_argument("--no-cache", action="store_true") + ap.add_argument("--no-overwrite", action="store_true", + help="refuse to run if output file already exists.") + ap.add_argument("--keep-intermediates", action="store_true", + help="keep the temp work dir (clips, base, concat list) after rendering.") + ap.add_argument("--batch", type=Path, default=None, + help="run a batch manifest (jobs.json or jobs.csv) instead.") + ap.add_argument("--continue-on-error", action="store_true", + help="when --batch: skip failing jobs instead of aborting.") + args = ap.parse_args() + + versions = preflight() + print(f"== preflight: ffmpeg {versions['ffmpeg']} / ffprobe {versions['ffprobe']} ==") + + if args.batch is not None: + manifest_path = args.batch.resolve() + rows = load_manifest(manifest_path) + results: 
list[dict] = [] + for i, row in enumerate(rows): + try: + job = job_from_dict(row, args, manifest_path.parent, i) + except SystemExit as e: + if args.continue_on_error: + print(f"[batch {i}] skipped: {e}") + results.append({"job": row.get("name", f"row{i}"), "ok": False, "error": str(e)}) + continue + raise + try: + results.append(run_job(job, versions["ffmpeg"])) + except SystemExit as e: + if args.continue_on_error: + print(f"[batch {i}] FAILED: {e}") + results.append({"job": job.name, "ok": False, "error": str(e)}) + continue + raise + summary_path = manifest_path.with_name(manifest_path.stem + "_qc_summary.json") + summary_path.write_text( + json.dumps({"jobs": results, "total": len(results), + "ok": sum(1 for r in results if r.get("ok"))}, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + print(f"\nbatch QC summary → {summary_path}") + ok = sum(1 for r in results if r.get("ok")) + print(f" {ok}/{len(results)} jobs ok") + return + + if args.srt is None or args.plan is None: + ap.error("--srt and --plan required (or use --batch)") + + job = Job( + source=args.source.resolve() if args.source else None, + srt=args.srt.resolve(), + plan=args.plan.resolve(), + voice=args.voice.resolve() if args.voice else None, + bg_volume=args.bg_volume, + tolerance=args.tolerance, + trim_direction=args.trim_direction, + on_short=args.on_short, + style=args.style, + fontsdir=args.fontsdir.resolve() if args.fontsdir else None, + output=args.output.resolve() if args.output else None, + name=args.plan.stem, + no_cache=args.no_cache, + keep_intermediates=args.keep_intermediates, + no_overwrite=args.no_overwrite, + ) + run_job(job, versions["ffmpeg"]) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index c2cff29..45ce7ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ [project.optional-dependencies] animations = ["manim"] +dev = ["pytest>=7"] [build-system] requires = ["setuptools>=61.0"] diff --git 
# Make the skill's helpers/ importable as a flat package (matches the
# `python helpers/srt_driven_edit.py` invocation contract).
HELPERS = Path(__file__).resolve().parent.parent / "helpers"
sys.path.insert(0, str(HELPERS))


FFMPEG = shutil.which("ffmpeg")
FFPROBE = shutil.which("ffprobe")


def pytest_collection_modifyitems(config, items):
    """Auto-skip the tests in this directory if ffmpeg/ffprobe are missing.

    pytest passes this hook the items of the whole session, not only those
    below this conftest, so the skip marker is restricted to test files that
    live under this directory.
    """
    if FFMPEG and FFPROBE:
        return
    here = Path(__file__).resolve().parent
    marker = pytest.mark.skip(reason="ffmpeg or ffprobe not on PATH")
    for item in items:
        # `item.path` exists on pytest >= 7; fall back to the legacy attribute.
        item_path = getattr(item, "path", None) or Path(str(item.fspath))
        if here in Path(item_path).resolve().parents:
            item.add_marker(marker)


# ---------------------------------------------------------------------------
# Synthetic media (session-scoped — each costs a few seconds to render)
# ---------------------------------------------------------------------------


def _ffmpeg(*args: str) -> None:
    """Run ffmpeg quietly with *args*; raise RuntimeError with stderr on failure."""
    cmd = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "error", *args]
    r = subprocess.run(cmd, capture_output=True, text=True,
                       encoding="utf-8", errors="replace")
    if r.returncode != 0:
        raise RuntimeError(f"ffmpeg failed:\n cmd: {' '.join(cmd)}\n stderr: {r.stderr}")


@pytest.fixture(scope="session")
def synth_av(tmp_path_factory) -> Path:
    """30s 1080p@24 testsrc2 + 440Hz sine. Spans long enough for sub-second cuts."""
    d = tmp_path_factory.mktemp("synth")
    out = d / "av.mp4"
    _ffmpeg(
        "-f", "lavfi", "-i", "testsrc2=size=1920x1080:rate=24:duration=30",
        "-f", "lavfi", "-i", "sine=frequency=440:duration=30",
        "-c:v", "libx264", "-preset", "ultrafast", "-pix_fmt", "yuv420p",
        "-c:a", "aac", "-b:a", "128k", "-ar", "48000",
        "-shortest",
        str(out),
    )
    return out


@pytest.fixture(scope="session")
def synth_v_only(tmp_path_factory) -> Path:
    """30s 1080p video without an audio track. Exercises the auto-degrade path."""
    d = tmp_path_factory.mktemp("synth_vonly")
    out = d / "v_only.mp4"
    _ffmpeg(
        "-f", "lavfi", "-i", "testsrc2=size=1920x1080:rate=24:duration=30",
        "-an",
        "-c:v", "libx264", "-preset", "ultrafast", "-pix_fmt", "yuv420p",
        "-t", "30",
        str(out),
    )
    return out


@pytest.fixture(scope="session")
def synth_voice(tmp_path_factory) -> Path:
    """5s 880Hz sine — drop-in per-segment voice clip."""
    d = tmp_path_factory.mktemp("synth_voice")
    out = d / "voice.wav"
    _ffmpeg(
        "-f", "lavfi", "-i", "sine=frequency=880:duration=5",
        "-ar", "48000", "-ac", "2",
        str(out),
    )
    return out


# ---------------------------------------------------------------------------
# Helpers for crafting SRT / plan files inside a test's tmp_path
# ---------------------------------------------------------------------------


def fmt_ts(s: float) -> str:
    """Format *s* seconds as an SRT timestamp ``HH:MM:SS,mmm`` (ms rounded)."""
    total_ms = int(round(s * 1000))
    h, rem = divmod(total_ms, 3600_000)
    m, rem = divmod(rem, 60_000)
    sec, ms = divmod(rem, 1000)
    return f"{h:02d}:{m:02d}:{sec:02d},{ms:03d}"


def write_srt(path: Path, cues: list[tuple[int, float, float, str]],
              encoding: str = "utf-8") -> None:
    """Write an SRT. cues: [(id, start_s, end_s, text)]."""
    lines: list[str] = []
    for cid, s, e, t in cues:
        lines.append(str(cid))
        lines.append(f"{fmt_ts(s)} --> {fmt_ts(e)}")
        lines.append(t)
        lines.append("")
    # Bytes are written directly so the requested encoding is applied verbatim.
    path.write_bytes("\n".join(lines).encode(encoding))


def write_plan_form_a(path: Path,
                      segments: list[tuple[int, float, float]]) -> None:
    """Legacy array form. segments: [(id, src_start_s, src_end_s)]."""
    data = [
        {"id": cid, "source_start": fmt_ts(s), "source_end": fmt_ts(e)}
        for cid, s, e in segments
    ]
    path.write_text(json.dumps(data, indent=2), encoding="utf-8")


def write_plan_form_b(path: Path, sources: dict[str, str],
                      segments: list[dict],
                      voices: dict[str, str] | None = None) -> None:
    """Object form with multi-source / multi-voice support."""
    data: dict = {"sources": sources, "segments": segments}
    if voices:
        data["voices"] = voices
    path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")


@pytest.fixture
def helpers_ns():
    """Convenience: bundle the helpers module + write_* functions in one object."""
    from types import SimpleNamespace

    # Imported lazily so collection still works when helpers/ cannot be imported.
    import srt_driven_edit as sde

    return SimpleNamespace(
        sde=sde,
        write_srt=write_srt,
        write_plan_form_a=write_plan_form_a,
        write_plan_form_b=write_plan_form_b,
        fmt_ts=fmt_ts,
    )
# ---------------------------------------------------------------------------
# Common cue/plan helpers used across batch jobs
# ---------------------------------------------------------------------------

# Two back-to-back cues: (id, start_s, end_s, text).
CUES_2 = [
    (1, 0.0, 2.0, "alpha"),
    (2, 2.0, 4.0, "beta"),
]

# Matching source ranges: (id, src_start_s, src_end_s).
PLAN_2 = [
    (1, 1.0, 3.0),
    (2, 5.0, 7.0),
]


def default_args_namespace() -> argparse.Namespace:
    """Return a Namespace carrying the fallback fields job_from_dict reads."""
    fallbacks = {
        "bg_volume": 0.0,
        "tolerance": 0.5,
        "trim_direction": "tail",
        "on_short": "error",
        "style": "auto",
        "no_cache": False,
        "keep_intermediates": False,
        "no_overwrite": False,
    }
    return argparse.Namespace(**fallbacks)


def run_batch(helpers_ns, manifest_path, ffmpeg_version, *,
              continue_on_error: bool = False) -> list[dict]:
    """Replay the CLI batch loop (load -> job_from_dict -> run_job) for tests.

    With ``continue_on_error`` a SystemExit from either step is recorded as
    an ``ok: False`` entry; otherwise it propagates to the caller.
    """
    module = helpers_ns.sde
    fallback_args = default_args_namespace()
    outcomes: list[dict] = []
    for index, entry in enumerate(module.load_manifest(manifest_path)):
        try:
            job = module.job_from_dict(entry, fallback_args, manifest_path.parent, index)
        except SystemExit as exc:
            if not continue_on_error:
                raise
            outcomes.append({
                "job": entry.get("name", f"row{index}"),
                "ok": False,
                "error": str(exc),
            })
            continue
        try:
            outcomes.append(module.run_job(job, ffmpeg_version))
        except SystemExit as exc:
            if not continue_on_error:
                raise
            outcomes.append({"job": job.name, "ok": False, "error": str(exc)})
    return outcomes


@pytest.fixture
def ffmpeg_version(helpers_ns) -> str:
    """ffmpeg version string as reported by the helper's preflight()."""
    versions = helpers_ns.sde.preflight()
    return versions["ffmpeg"]
# ---------------------------------------------------------------------------
# 1. Two jobs same name, no output specified → auto-isolated outputs
# ---------------------------------------------------------------------------


def test_batch_auto_isolation(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """Rows sharing a job name but lacking `output` must render to distinct files."""
    indices = (0, 1)
    for n in indices:
        helpers_ns.write_srt(tmp_path / f"script_{n}.srt", CUES_2)
        helpers_ns.write_plan_form_a(tmp_path / f"plan_{n}.json", PLAN_2)

    # Same name on purpose: the collision is what the auto-isolation guards.
    rows = [
        {"name": "promo",
         "source": str(synth_av),
         "srt": f"script_{n}.srt",
         "plan": f"plan_{n}.json"}
        for n in indices
    ]
    manifest_path = tmp_path / "jobs.json"
    manifest_path.write_text(json.dumps(rows), encoding="utf-8")

    results = run_batch(helpers_ns, manifest_path, ffmpeg_version)
    assert len(results) == 2
    assert all(r["ok"] for r in results)

    first, second = (Path(r["output_path"]) for r in results)
    # auto-isolated → distinct, each carrying its manifest index suffix
    assert first != second
    assert "_00" in first.name
    assert "_01" in second.name
    assert first.exists()
    assert second.exists()
# ---------------------------------------------------------------------------
# 2. continue-on-error skips a malformed row, finishes the rest
# ---------------------------------------------------------------------------


def test_batch_continue_on_error(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """A row with no plan is reported as failed while its neighbours still render."""
    source = str(synth_av)
    # Only rows 0 and 2 get real inputs; row 1 is the deliberately broken one.
    for n in (0, 2):
        helpers_ns.write_srt(tmp_path / f"s{n}.srt", CUES_2)
        helpers_ns.write_plan_form_a(tmp_path / f"p{n}.json", PLAN_2)

    rows = [
        {"name": "ok0", "source": source, "srt": "s0.srt", "plan": "p0.json"},
        {"name": "broken", "source": source, "srt": "s_missing.srt"},  # no plan, srt also missing
        {"name": "ok2", "source": source, "srt": "s2.srt", "plan": "p2.json"},
    ]
    manifest_path = tmp_path / "jobs.json"
    manifest_path.write_text(json.dumps(rows), encoding="utf-8")

    results = run_batch(helpers_ns, manifest_path, ffmpeg_version,
                        continue_on_error=True)
    assert len(results) == 3
    good_first, failed, good_last = results
    assert good_first["ok"] is True
    assert failed["ok"] is False and "error" in failed
    assert good_last["ok"] is True


def test_batch_aborts_without_continue_on_error(
    helpers_ns, ffmpeg_version, synth_av, tmp_path
):
    """Without continue_on_error the broken row's SystemExit reaches the caller."""
    source = str(synth_av)
    helpers_ns.write_srt(tmp_path / "s0.srt", CUES_2)
    helpers_ns.write_plan_form_a(tmp_path / "p0.json", PLAN_2)

    rows = [
        {"name": "ok0", "source": source, "srt": "s0.srt", "plan": "p0.json"},
        {"name": "broken", "source": source, "srt": "s_missing.srt"},  # no plan
    ]
    manifest_path = tmp_path / "jobs.json"
    manifest_path.write_text(json.dumps(rows), encoding="utf-8")

    with pytest.raises(SystemExit):
        run_batch(helpers_ns, manifest_path, ffmpeg_version,
                  continue_on_error=False)
# ---------------------------------------------------------------------------
# 3. CSV manifest is supported
# ---------------------------------------------------------------------------


def test_batch_csv_manifest(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """A one-row jobs.csv goes through the same loop as a JSON manifest."""
    helpers_ns.write_srt(tmp_path / "s.srt", CUES_2)
    helpers_ns.write_plan_form_a(tmp_path / "p.json", PLAN_2)

    header = "name,source,srt,plan,bg_volume\n"
    row = f"promo,{synth_av},s.srt,p.json,0.0\n"
    manifest = tmp_path / "jobs.csv"
    manifest.write_text(header + row, encoding="utf-8")

    results = run_batch(helpers_ns, manifest, ffmpeg_version)
    assert len(results) == 1 and results[0]["ok"] is True


# ---------------------------------------------------------------------------
# 4. Different bg_volume per job is honored (cache must NOT collide)
# ---------------------------------------------------------------------------


def test_batch_per_job_bg_volume(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """Two jobs differing only in bg_volume get distinct audio modes and caches."""
    helpers_ns.write_srt(tmp_path / "s.srt", CUES_2)
    helpers_ns.write_plan_form_a(tmp_path / "p.json", PLAN_2)

    shared = {"source": str(synth_av), "srt": "s.srt", "plan": "p.json"}
    rows = [
        {"name": "silent", **shared, "bg_volume": 0.0},
        {"name": "bg10", **shared, "bg_volume": 0.1},
    ]
    manifest = tmp_path / "jobs.json"
    manifest.write_text(json.dumps(rows), encoding="utf-8")

    results = run_batch(helpers_ns, manifest, ffmpeg_version)
    assert len(results) == 2 and all(r["ok"] for r in results)

    silent_report, bg_report = results
    assert silent_report["audio"]["mode"] == "silent"
    assert bg_report["audio"]["mode"] == "original_only"
    # bg10 should NOT have hit cache from silent (different effective_bg → different key)
    assert all(seg["cached"] is False for seg in bg_report["segments"])
# ---------------------------------------------------------------------------
# Test helpers
# ---------------------------------------------------------------------------

# (id, start_s, end_s, text); cue 3 starts 1.5s after cue 2 ends.
DEFAULT_CUES = [
    (1, 0.0, 2.0, "first cue"),
    (2, 2.0, 4.5, "second cue"),
    (3, 6.0, 8.5, "third cue with leading gap"),
]

# (id, src_start_s, src_end_s); durations match the cues: 2.0s, 2.5s, 2.5s.
DEFAULT_PLAN = [
    (1, 1.0, 3.0),
    (2, 5.0, 7.5),
    (3, 10.0, 12.5),
]


def make_job(helpers_ns, srt_path, plan_path, tmp_path, *,
             source=None, voice=None, bg_volume=0.0,
             style="auto", no_overwrite=False, output=None):
    """Build an sde.Job with test defaults; output falls back to tmp_path/out.mp4."""
    destination = output if output else tmp_path / "out.mp4"
    settings = dict(
        source=source,
        srt=srt_path,
        plan=plan_path,
        voice=voice,
        bg_volume=bg_volume,
        tolerance=0.5,
        trim_direction="tail",
        on_short="error",
        style=style,
        fontsdir=None,
        output=destination,
        name=srt_path.stem,
        no_cache=False,
        keep_intermediates=False,
        no_overwrite=no_overwrite,
    )
    return helpers_ns.sde.Job(**settings)


@pytest.fixture
def ffmpeg_version(helpers_ns) -> str:
    """ffmpeg version string as reported by the helper's preflight()."""
    versions = helpers_ns.sde.preflight()
    return versions["ffmpeg"]
# ---------------------------------------------------------------------------
# 1. Basic e2e: source.mp4 + 3 cues → final has expected duration
# ---------------------------------------------------------------------------


def test_basic_single_job(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """Form-A plan against one source renders an 8.5s silent output."""
    srt_file = tmp_path / "script.srt"
    plan_file = tmp_path / "plan.json"
    helpers_ns.write_srt(srt_file, DEFAULT_CUES)
    helpers_ns.write_plan_form_a(plan_file, DEFAULT_PLAN)

    report = helpers_ns.sde.run_job(
        make_job(helpers_ns, srt_file, plan_file, tmp_path, source=synth_av),
        ffmpeg_version,
    )

    assert report["ok"] is True
    timing = report["duration"]
    assert timing["expected_s"] == 8.5
    assert abs(timing["drift_ms"]) <= 200
    assert (tmp_path / "out.mp4").exists()
    assert report["audio"]["mode"] == "silent"  # bg_volume=0, no voice


# ---------------------------------------------------------------------------
# 2. GB18030 SRT input — encoding fallback must let the pipeline complete
# ---------------------------------------------------------------------------


def test_gbk_srt_input(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """A GB18030-encoded CJK SRT is decoded and selects the CJK subtitle style."""
    srt_file = tmp_path / "script_gbk.srt"
    plan_file = tmp_path / "plan.json"
    texts = ["第一条", "第二条", "第三条 含 gap"]
    # Same ids and timings as DEFAULT_CUES, with CJK caption text.
    cjk_cues = [
        (cid, start, end, text)
        for (cid, start, end, _), text in zip(DEFAULT_CUES, texts)
    ]
    helpers_ns.write_srt(srt_file, cjk_cues, encoding="gb18030")
    helpers_ns.write_plan_form_a(plan_file, DEFAULT_PLAN)

    report = helpers_ns.sde.run_job(
        make_job(helpers_ns, srt_file, plan_file, tmp_path,
                 source=synth_av, style="auto"),
        ffmpeg_version,
    )

    assert report["ok"] is True
    assert "Microsoft YaHei UI" in report["subtitles"]["force_style"], \
        "auto style should pick cjk-natural when SRT contains CJK"
# ---------------------------------------------------------------------------
# 3. CJK in output path — work_dir + ensure_safe_subs_path must save us
# ---------------------------------------------------------------------------


def test_cjk_in_output_path(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """SRT, plan and output living under a CJK-named directory still render."""
    workspace = tmp_path / "中文 目录"
    workspace.mkdir()
    srt_file = workspace / "字幕.srt"
    plan_file = workspace / "plan.json"
    target = workspace / "成片.mp4"
    helpers_ns.write_srt(srt_file, DEFAULT_CUES)
    helpers_ns.write_plan_form_a(plan_file, DEFAULT_PLAN)

    report = helpers_ns.sde.run_job(
        make_job(helpers_ns, srt_file, plan_file, tmp_path,
                 source=synth_av, output=target),
        ffmpeg_version,
    )

    assert report["ok"] is True
    assert target.exists()


# ---------------------------------------------------------------------------
# 4. Per-segment voice — audio.mode should reflect voice usage
# ---------------------------------------------------------------------------


def test_per_segment_voice(helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path):
    """A Form-B plan with a voice on one segment reports voice_replace audio."""
    srt_file = tmp_path / "script.srt"
    plan_file = tmp_path / "plan.json"
    helpers_ns.write_srt(srt_file, DEFAULT_CUES)

    # (id, source_start, source_end, voice key or None)
    ranges = [
        (1, "00:00:01,000", "00:00:03,000", "v1"),
        (2, "00:00:05,000", "00:00:07,500", None),
        (3, "00:00:10,000", "00:00:12,500", None),
    ]
    segments = []
    for cue_id, start, end, voice_key in ranges:
        entry = {"id": cue_id, "source": "A",
                 "source_start": start, "source_end": end}
        if voice_key is not None:
            entry["voice"] = voice_key
        segments.append(entry)

    helpers_ns.write_plan_form_b(
        plan_file,
        sources={"A": str(synth_av)},
        voices={"v1": str(synth_voice)},
        segments=segments,
    )

    # source=None — Form B declares its own sources
    report = helpers_ns.sde.run_job(
        make_job(helpers_ns, srt_file, plan_file, tmp_path), ffmpeg_version
    )

    assert report["ok"] is True
    audio = report["audio"]
    assert audio["voice_used"] is True
    assert audio["mode"] == "voice_replace"  # bg_volume == 0
# ---------------------------------------------------------------------------
# 5. Video-only source + bg_volume > 0 → auto-degrade, no crash
# ---------------------------------------------------------------------------


def test_video_only_source_with_bg_volume(
    helpers_ns, ffmpeg_version, synth_v_only, tmp_path, capsys
):
    """bg_volume on an audio-less source prints a warning and still succeeds."""
    srt_file = tmp_path / "script.srt"
    plan_file = tmp_path / "plan.json"
    helpers_ns.write_srt(srt_file, DEFAULT_CUES)
    helpers_ns.write_plan_form_a(plan_file, DEFAULT_PLAN)

    report = helpers_ns.sde.run_job(
        make_job(helpers_ns, srt_file, plan_file, tmp_path,
                 source=synth_v_only, bg_volume=0.5),
        ffmpeg_version,
    )

    stdout_text = capsys.readouterr().out
    assert "no audio track" in stdout_text, \
        "expected a WARNING about source having no audio"
    assert report["ok"] is True


# ---------------------------------------------------------------------------
# 6. Source range out of bounds → SystemExit before extraction
# ---------------------------------------------------------------------------


def test_range_out_of_bounds_fails_fast(
    helpers_ns, ffmpeg_version, synth_av, tmp_path
):
    """A range past the 30s source aborts with SystemExit and leaves no output."""
    srt_file = tmp_path / "script.srt"
    plan_file = tmp_path / "plan.json"
    helpers_ns.write_srt(srt_file, DEFAULT_CUES)
    # source is 30s, but the last range asks for 0:50 — way over
    out_of_range = [
        (1, 1.0, 3.0),
        (2, 5.0, 7.5),
        (3, 50.0, 52.5),  # bad
    ]
    helpers_ns.write_plan_form_a(plan_file, out_of_range)

    job = make_job(helpers_ns, srt_file, plan_file, tmp_path, source=synth_av)
    with pytest.raises(SystemExit) as failure:
        helpers_ns.sde.run_job(job, ffmpeg_version)
    assert "exceeds source" in str(failure.value)
    # And the failure happened pre-extract, so no out.mp4
    assert not (tmp_path / "out.mp4").exists()
# ---------------------------------------------------------------------------
# 7. Second run hits cache for every segment
# ---------------------------------------------------------------------------


def test_cache_hit_on_rerun(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """Running the identical job twice reuses every segment clip on the rerun."""
    srt = tmp_path / "script.srt"
    plan = tmp_path / "plan.json"
    helpers_ns.write_srt(srt, DEFAULT_CUES)
    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)

    job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av)
    qc1 = helpers_ns.sde.run_job(job, ffmpeg_version)
    qc2 = helpers_ns.sde.run_job(job, ffmpeg_version)

    assert all(s["cached"] is False for s in qc1["segments"])
    assert all(s["cached"] is True for s in qc2["segments"])
    # No wall-clock comparison between the two runs: elapsed time depends on
    # machine load, so it would make this test flaky. The `cached` flags
    # above are the deterministic proof of reuse.


# ---------------------------------------------------------------------------
# 8. --no-overwrite refuses to clobber existing output
# ---------------------------------------------------------------------------


def test_no_overwrite_refuses(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """With no_overwrite set, a second job targeting the same output aborts."""
    srt = tmp_path / "script.srt"
    plan = tmp_path / "plan.json"
    helpers_ns.write_srt(srt, DEFAULT_CUES)
    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)

    job1 = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av)
    helpers_ns.sde.run_job(job1, ffmpeg_version)

    job2 = make_job(helpers_ns, srt, plan, tmp_path,
                    source=synth_av, no_overwrite=True)
    with pytest.raises(SystemExit) as exc:
        helpers_ns.sde.run_job(job2, ffmpeg_version)
    assert "no-overwrite" in str(exc.value)


# ---------------------------------------------------------------------------
# 9. Global voice — spans the timeline, mixes under bg, never touches the cache
# ---------------------------------------------------------------------------


def test_global_voice_spans_timeline(
    helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path
):
    """Global --voice must span the WHOLE output timeline, not restart per segment.

    Regression: earlier implementation expanded --voice into a synthetic
    per-segment voice on every entry, which made each segment apad/atrim
    voice.wav from t=0 — so a 5s voice would replay at every cut. The fix
    moves global-voice mixing into the final compose step where voice is
    apad'd / atrim'd to total_duration once.
    """
    srt = tmp_path / "script.srt"
    plan = tmp_path / "plan.json"
    helpers_ns.write_srt(srt, DEFAULT_CUES)  # total 8.5s
    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)

    job = make_job(helpers_ns, srt, plan, tmp_path,
                   source=synth_av, voice=synth_voice)
    qc = helpers_ns.sde.run_job(job, ffmpeg_version)

    assert qc["ok"] is True
    assert qc["audio"]["voice_used"] is True
    assert qc["audio"]["mode"] == "voice_replace"
    assert qc["audio"]["bg_volume"] == 0.0
    # Per-segment voice slot must be None — proves we are NOT smuggling the
    # global voice in via the per-segment expansion hack.
    assert all(s["voice"] is None for s in qc["segments"])

    # Output duration matches SRT total (voice apad'd from 5s → 8.5s)
    actual = helpers_ns.sde.probe_duration(tmp_path / "out.mp4")
    assert abs(actual - 8.5) < 0.25, f"actual {actual}s vs expected 8.5s"


def test_global_voice_with_bg_volume_mix(
    helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path
):
    """With bg_volume>0 and global voice, base audio (source*bg) is mixed
    under voice. The bg_volume is applied ONCE at extract; the final compose
    must not re-scale it.
    """
    srt = tmp_path / "script.srt"
    plan = tmp_path / "plan.json"
    helpers_ns.write_srt(srt, DEFAULT_CUES)
    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)

    job = make_job(helpers_ns, srt, plan, tmp_path,
                   source=synth_av, voice=synth_voice, bg_volume=0.1)
    qc = helpers_ns.sde.run_job(job, ffmpeg_version)

    assert qc["ok"] is True
    assert qc["audio"]["mode"] == "voice_mix"
    assert qc["audio"]["bg_volume"] == 0.1


def test_global_voice_cache_independence(
    helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path
):
    """Segment cache must NOT depend on the global voice file. Running once
    without voice then again with voice should reuse all segment caches —
    voice gets mixed in the final pass, segments are identical.
    """
    srt = tmp_path / "script.srt"
    plan = tmp_path / "plan.json"
    helpers_ns.write_srt(srt, DEFAULT_CUES)
    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)

    job_no_voice = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av)
    qc1 = helpers_ns.sde.run_job(job_no_voice, ffmpeg_version)

    job_with_voice = make_job(
        helpers_ns, srt, plan, tmp_path,
        source=synth_av, voice=synth_voice,
        output=tmp_path / "out_voiced.mp4",
    )
    qc2 = helpers_ns.sde.run_job(job_with_voice, ffmpeg_version)

    assert all(s["cached"] is False for s in qc1["segments"]), \
        "first run should not have cache hits"
    assert all(s["cached"] is True for s in qc2["segments"]), \
        "second run with global voice should hit segment cache — voice is " \
        "mixed in the final pass, not baked into segments"


# ---------------------------------------------------------------------------
# 10. SRT gap → output duration includes the gap as black+silent
# ---------------------------------------------------------------------------


def test_gap_inserted_in_output(helpers_ns, ffmpeg_version, synth_av, tmp_path):
    """A 1.5s hole between two cues is rendered, so the output lasts 6.0s."""
    srt = tmp_path / "script.srt"
    plan = tmp_path / "plan.json"
    # 2 cues with a 1.5s gap between them: total output = 2 + 1.5 + 2.5 = 6.0s
    cues = [
        (1, 0.0, 2.0, "first"),
        (2, 3.5, 6.0, "second after gap"),
    ]
    helpers_ns.write_srt(srt, cues)
    helpers_ns.write_plan_form_a(plan, [(1, 1.0, 3.0), (2, 5.0, 7.5)])

    job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av)
    qc = helpers_ns.sde.run_job(job, ffmpeg_version)

    assert qc["ok"] is True
    assert qc["duration"]["expected_s"] == 6.0
    assert abs(qc["duration"]["drift_ms"]) <= 200
    # ffprobe the actual output to double-check
    actual = helpers_ns.sde.probe_duration(tmp_path / "out.mp4")
    assert abs(actual - 6.0) < 0.25, f"actual {actual}s, expected 6.0s"
Hard failure modes (exit 1): any cue with no assignable candidate; malformed transcript JSON; transcript with no word tokens. Soft failures (warnings only): low score, candidate too short for cue. The matcher cannot understand storyline — if SRT narration words do not appear in the source transcript, scores will be low. The sidecar review.md is the manual QA surface; it is intentionally not pulled into the plan (parse_plan in srt_driven_edit stays strict). --packed (takes_packed.md) and --context-window flags are reserved placeholders only; both raise no error but do not yet alter behavior. Includes 11 pytest tests including a full end-to-end: recommend -> sde.run_job -> final.mp4 against lavfi-synthesized media. Co-Authored-By: Claude Opus 4.7 --- helpers/recommend_edit_plan.py | 561 ++++++++++++++++++++++++++++++ tests/test_recommend_edit_plan.py | 366 +++++++++++++++++++ 2 files changed, 927 insertions(+) create mode 100644 helpers/recommend_edit_plan.py create mode 100644 tests/test_recommend_edit_plan.py diff --git a/helpers/recommend_edit_plan.py b/helpers/recommend_edit_plan.py new file mode 100644 index 0000000..14b2c8a --- /dev/null +++ b/helpers/recommend_edit_plan.py @@ -0,0 +1,561 @@ +"""Recommend an edit_plan.json from script.srt + source transcript. + +Pipeline position: + script.srt + transcript.json + --(this script)--> + edit_plan.json + edit_plan_review.md + --(srt_driven_edit.py)--> + final.mp4 + +Matching is best-effort LEXICAL (no LLM, no semantic understanding): + 1. Parse Scribe JSON → keep only timestamped 'word' tokens. Without + word-level start/end timestamps we cannot produce reliable + source_start / source_end, so plain-text transcripts are not usable. + 2. Build candidate ranges by breaking on sentence-end punctuation, + silences ≥ gap_threshold, or speaker change; split long candidates + at phrase punctuation then by hard word-level windows. + 3. 
For each SRT cue, score every candidate by: + 0.6 * SequenceMatcher(normalized chars) + + 0.4 * Jaccard (token-level for Latin / 2-gram for CJK) + blended with duration similarity at 0.7 / 0.3. + The matcher cannot understand storyline — if the SRT narration uses + words not present in the source transcript, scores will be low and + matches will need manual review. + 4. Greedy assignment, no reuse unless --allow-reuse. + 5. Emit Form-A or Form-B plan + a sidecar review markdown. + +Reserved CLI flags (placeholders, not yet wired up): + --packed takes_packed.md input (use --transcript for now) + --context-window padding around matched ranges + +Usage: + python helpers/recommend_edit_plan.py \\ + --script script.srt \\ + --transcript edit/transcripts/source.json \\ + --source source.mp4 \\ + -o edit_plan.json + python helpers/srt_driven_edit.py \\ + --source source.mp4 --srt script.srt --plan edit_plan.json -o final.mp4 +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from pathlib import Path + +try: + from srt_driven_edit import ( + parse_srt as _parse_srt, + format_srt_ts, + CJK_RE, + SrtCue, # only for type hints + ) +except Exception as e: + raise SystemExit( + "recommend_edit_plan: failed to import from srt_driven_edit.py. " + f"Both files must be importable from the same helpers/ dir. ({e})" + ) + + +# ============================================================================ +# Candidate parsing +# ============================================================================ + + +SENT_END_PUNCT = set(".?!。?!") +PHRASE_PUNCT = set(",;:,;:、") + + +@dataclass +class Candidate: + start: float + end: float + text: str + + @property + def duration(self) -> float: + return self.end - self.start + + +def load_transcript_words(path: Path, keep_audio_events: bool = False) -> list[dict]: + """Return Scribe word tokens with valid timestamps. 
Optionally keep audio events.""" + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as e: + raise SystemExit(f"transcript not valid JSON: {path}: {e}") + words = data.get("words") + if not isinstance(words, list): + raise SystemExit(f"transcript missing 'words' list: {path}") + out: list[dict] = [] + for w in words: + wt = w.get("type") + if wt == "word": + if w.get("start") is None or w.get("end") is None: + continue + out.append(w) + elif wt == "audio_event" and keep_audio_events: + out.append(w) + if not out: + raise SystemExit(f"transcript has no usable word tokens: {path}") + return out + + +def _join_words(words: list[dict]) -> str: + """Concatenate word texts. Single space between. CJK joiners are removed + again at normalize time so this is safe even when neighbors are Chinese.""" + return " ".join((w.get("text") or "").strip() for w in words if (w.get("text") or "").strip()) + + +def _hard_split(part: list[dict], max_dur: float) -> list[Candidate]: + """Walk word-by-word, close a chunk as soon as adding the next word would + exceed max_dur. Every emitted chunk lands on a word boundary by construction. + """ + out: list[Candidate] = [] + chunk: list[dict] = [] + cs = float(part[0]["start"]) + for w in part: + we = float(w["end"]) + if chunk and (we - cs) > max_dur: + ce = float(chunk[-1]["end"]) + out.append(Candidate(cs, ce, _join_words(chunk))) + chunk = [] + cs = float(w["start"]) + chunk.append(w) + if chunk: + out.append(Candidate(cs, float(chunk[-1]["end"]), _join_words(chunk))) + return out + + +def build_candidates( + words: list[dict], + *, + gap_threshold: float = 0.5, + max_dur: float = 12.0, + min_dur: float = 0.4, +) -> list[Candidate]: + """Group words into phrase-level candidates. 
Non-overlapping by construction.""" + # Step 1: raw groups by sentence-end punct / silence / speaker change + raw_groups: list[list[dict]] = [] + current: list[dict] = [] + prev_end: float | None = None + prev_speaker: str | None = None + for w in words: + if w.get("type") != "word": + continue + text = (w.get("text") or "").strip() + if not text: + continue + ws = float(w["start"]) + we = float(w["end"]) + speaker = w.get("speaker_id") + if prev_speaker is not None and speaker is not None and speaker != prev_speaker: + if current: + raw_groups.append(current); current = [] + if prev_end is not None and (ws - prev_end) >= gap_threshold: + if current: + raw_groups.append(current); current = [] + current.append(w) + prev_end = we + prev_speaker = speaker + if text[-1] in SENT_END_PUNCT: + raw_groups.append(current); current = [] + if current: + raw_groups.append(current) + + # Step 2: split groups that exceed max_dur — phrase punct first, then hard + out: list[Candidate] = [] + for group in raw_groups: + if not group: + continue + start = float(group[0]["start"]) + end = float(group[-1]["end"]) + if end - start <= max_dur: + out.append(Candidate(start, end, _join_words(group))) + continue + parts: list[list[dict]] = [] + buf: list[dict] = [] + for w in group: + buf.append(w) + text = (w.get("text") or "").strip() + if text and text[-1] in PHRASE_PUNCT: + parts.append(buf); buf = [] + if buf: + parts.append(buf) + for part in parts: + ps = float(part[0]["start"]); pe = float(part[-1]["end"]) + if pe - ps <= max_dur: + out.append(Candidate(ps, pe, _join_words(part))) + else: + out.extend(_hard_split(part, max_dur)) + + return [c for c in out if c.duration >= min_dur] + + +# ============================================================================ +# Scoring +# ============================================================================ + + +# Keep word characters, whitespace, and CJK ranges; replace everything else +# (punctuation, brackets, audio-event markers) 
# with a space.
_NORMALIZE_RE = re.compile(
    r"[^\w\s一-鿿㐀-䶿぀-ゟ゠-ヿ가-힯]+",
    flags=re.UNICODE,
)
_WS_RE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Return *text* case-folded with punctuation and whitespace runs collapsed.

    Every run matched by ``_NORMALIZE_RE`` becomes one space, whitespace runs
    are squeezed to a single space, and the result is stripped.
    """
    s = text.casefold()
    s = _NORMALIZE_RE.sub(" ", s)
    s = _WS_RE.sub(" ", s).strip()
    return s


def is_cjk_heavy(text: str) -> bool:
    """True if at least half of the non-whitespace characters are CJK.

    "CJK" is whatever ``CJK_RE`` (imported from ``srt_driven_edit``) matches
    on a single character. Empty / whitespace-only text is never CJK-heavy.
    """
    chars = [c for c in text if not c.isspace()]
    if not chars:
        return False
    cjk = sum(1 for c in chars if CJK_RE.match(c))
    return cjk * 2 >= len(chars)


def _tokens(text: str) -> list[str]:
    """Split *text* on whitespace runs (word-level units for Latin text)."""
    return text.split()


def _char_bigrams(text: str) -> set[str]:
    """Return the set of adjacent character pairs in *text*, ignoring whitespace.

    A text with fewer than two non-whitespace characters has no bigram; it is
    represented by its single character instead (or the empty set when there
    is none). Without this fallback two *different* one-character strings
    would both map to the empty set, and ``_jaccard`` treats two empty sets
    as identical (1.0), giving unrelated single-character cues a spurious
    similarity credit.
    """
    chars = [c for c in text if not c.isspace()]
    if len(chars) < 2:
        return set(chars)
    return {"".join(chars[i:i + 2]) for i in range(len(chars) - 1)}


def _jaccard(a: set | list, b: set | list) -> float:
    """Jaccard index ``|A ∩ B| / |A ∪ B|`` of the two collections, as sets.

    Two empty collections count as identical (1.0); exactly one empty
    collection yields 0.0.
    """
    sa, sb = set(a), set(b)
    if not sa and not sb:
        return 1.0
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)


def text_similarity(cue_text: str, cand_text: str) -> float:
    """Blend of SequenceMatcher (local structure) and Jaccard (bag of units).

    Both texts are normalized first; if either normalizes to the empty string
    the similarity is 0.0. The Jaccard term compares character bigrams when
    either side is CJK-heavy and whitespace-separated tokens otherwise.
    The result is ``0.6 * sequence_ratio + 0.4 * jaccard``.
    """
    a = normalize_text(cue_text)
    b = normalize_text(cand_text)
    if not a or not b:
        return 0.0
    seq = SequenceMatcher(None, a, b, autojunk=False).ratio()
    if is_cjk_heavy(a) or is_cjk_heavy(b):
        jc = _jaccard(_char_bigrams(a), _char_bigrams(b))
    else:
        jc = _jaccard(_tokens(a), _tokens(b))
    return 0.6 * seq + 0.4 * jc


def duration_similarity(cand_dur: float, cue_dur: float) -> float:
    """Return ``1 / (1 + |cand_dur - cue_dur| / cue_dur)``.

    Equal durations score 1.0 and the score decays as the relative difference
    grows. A non-positive *cue_dur* scores 0.0 (avoids dividing by zero).
    """
    if cue_dur <= 0:
        return 0.0
    delta = abs(cand_dur - cue_dur)
    return 1.0 / (1.0 + delta / cue_dur)


def combined_score(cue: SrtCue, cand: Candidate,
                   w_text: float = 0.7, w_dur: float = 0.3) -> float:
    """Weighted sum of text similarity and duration similarity for one pair.

    Reads ``cue.text`` / ``cue.duration`` and ``cand.text`` / ``cand.duration``;
    *w_text* and *w_dur* are the respective weights (0.7 / 0.3 by default).
    """
    return (
        w_text * text_similarity(cue.text, cand.text)
        + w_dur * duration_similarity(cand.duration, cue.duration)
    )


# ============================================================================
# Assignment
#
============================================================================ + + +@dataclass +class Assignment: + cue_id: int + cue_text: str + cue_duration: float + cand: Candidate | None + score: float + warnings: list[str] = field(default_factory=list) + + +def assign( + cues: list[SrtCue], + candidates: list[Candidate], + *, + allow_reuse: bool = False, + min_score: float = 0.35, + duration_warn_ratio: float = 0.5, +) -> list[Assignment]: + used: set[int] = set() + out: list[Assignment] = [] + for cue in cues: + best_idx = -1 + best_score = -1.0 + for i, cand in enumerate(candidates): + if not allow_reuse and i in used: + continue + s = combined_score(cue, cand) + if s > best_score: + best_score = s + best_idx = i + warns: list[str] = [] + cand_out: Candidate | None = None + if best_idx < 0: + warns.append("no candidate available") + score_out = 0.0 + else: + cand_out = candidates[best_idx] + score_out = best_score + if not allow_reuse: + used.add(best_idx) + if best_score < min_score: + warns.append(f"low score {best_score:.3f} < {min_score}") + if cue.duration > 0: + dd_ratio = abs(cand_out.duration - cue.duration) / cue.duration + if dd_ratio > duration_warn_ratio: + warns.append( + f"duration mismatch: cand {cand_out.duration:.2f}s vs " + f"cue {cue.duration:.2f}s ({dd_ratio:.0%} off)" + ) + if cand_out.duration + 1e-6 < cue.duration: + warns.append( + "candidate shorter than cue — will need `--on-short pad` " + "in srt_driven_edit" + ) + out.append(Assignment( + cue_id=cue.id, cue_text=cue.text, cue_duration=cue.duration, + cand=cand_out, score=score_out, warnings=warns, + )) + return out + + +# ============================================================================ +# Output writers +# ============================================================================ + + +def _require_all_assigned(assignments: list[Assignment]) -> None: + missing = [a.cue_id for a in assignments if a.cand is None] + if missing: + raise SystemExit( + f"no candidate found 
for cue(s) {missing}. " + "Add transcript coverage, lower --gap-threshold, or pass --allow-reuse." + ) + + +def write_plan_form_a(assignments: list[Assignment], out_path: Path) -> None: + _require_all_assigned(assignments) + rows = [ + { + "id": a.cue_id, + "source_start": format_srt_ts(a.cand.start), + "source_end": format_srt_ts(a.cand.end), + } + for a in assignments + ] + out_path.write_text(json.dumps(rows, indent=2, ensure_ascii=False), encoding="utf-8") + + +def write_plan_form_b( + assignments: list[Assignment], + source_path: Path, + source_name: str, + out_path: Path, +) -> None: + _require_all_assigned(assignments) + data = { + "sources": {source_name: str(source_path)}, + "segments": [ + { + "id": a.cue_id, + "source": source_name, + "source_start": format_srt_ts(a.cand.start), + "source_end": format_srt_ts(a.cand.end), + } + for a in assignments + ], + } + out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8") + + +def write_review(assignments: list[Assignment], out_path: Path) -> None: + lines: list[str] = ["# Edit plan review", ""] + total = len(assignments) + matched = sum(1 for a in assignments if a.cand is not None) + warned = sum(1 for a in assignments if a.warnings) + avg = (sum(a.score for a in assignments if a.cand) / max(matched, 1)) + lines.append(f"- total cues: {total}") + lines.append(f"- matched: {matched}/{total}") + lines.append(f"- with warnings: {warned}") + lines.append(f"- average score: {avg:.3f}") + lines.append("") + for a in assignments: + lines.append(f"## cue id={a.cue_id}") + lines.append(f"- **cue text**: {a.cue_text!r}") + lines.append(f"- **cue duration**: {a.cue_duration:.3f}s") + if a.cand is None: + lines.append("- **match**: NONE") + else: + lines.append(f"- **matched text**: {a.cand.text!r}") + lines.append( + f"- **source range**: {format_srt_ts(a.cand.start)} → " + f"{format_srt_ts(a.cand.end)} ({a.cand.duration:.3f}s)" + ) + lines.append(f"- **score**: {a.score:.3f}") + dd = 
a.cand.duration - a.cue_duration + lines.append(f"- **duration delta**: {dd:+.3f}s") + for w in a.warnings: + lines.append(f"- **WARNING**: {w}") + lines.append("") + out_path.write_text("\n".join(lines), encoding="utf-8") + + +# ============================================================================ +# Top-level callable (used by CLI and tests) +# ============================================================================ + + +def recommend( + *, + script_srt: Path, + transcript: Path, + source: Path, + output: Path, + review: Path | None = None, + source_name: str = "A", + output_format: str = "form-a", + gap_threshold: float = 0.5, + max_cand_dur: float = 12.0, + min_cand_dur: float = 0.4, + min_score: float = 0.35, + allow_reuse: bool = False, + keep_audio_events: bool = False, +) -> list[Assignment]: + cues = _parse_srt(script_srt) + if not cues: + raise SystemExit(f"script.srt has no cues: {script_srt}") + + words = load_transcript_words(transcript, keep_audio_events=keep_audio_events) + candidates = build_candidates( + words, + gap_threshold=gap_threshold, + max_dur=max_cand_dur, + min_dur=min_cand_dur, + ) + if not candidates: + raise SystemExit( + f"no candidates built from transcript {transcript}. " + "Try lowering --min-cand-dur or check transcript quality." 
+ ) + + assignments = assign( + cues, candidates, + allow_reuse=allow_reuse, min_score=min_score, + ) + + if output_format == "form-a": + write_plan_form_a(assignments, output) + elif output_format == "form-b": + write_plan_form_b(assignments, source, source_name, output) + else: + raise SystemExit(f"unknown --format: {output_format}") + + if review is None: + review = output.with_name(output.stem + "_review.md") + write_review(assignments, review) + return assignments + + +# ============================================================================ +# CLI +# ============================================================================ + + +def main() -> None: + ap = argparse.ArgumentParser( + description="Recommend edit_plan.json from script.srt + Scribe transcript", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=( + "Example:\n" + " python helpers/recommend_edit_plan.py \\\n" + " --script script.srt \\\n" + " --transcript edit/transcripts/source.json \\\n" + " --source source.mp4 \\\n" + " -o edit_plan.json\n" + " python helpers/srt_driven_edit.py \\\n" + " --source source.mp4 --srt script.srt --plan edit_plan.json -o final.mp4" + ), + ) + ap.add_argument("--script", type=Path, required=True, + help="script.srt (target captions timeline)") + ap.add_argument("--transcript", type=Path, required=True, + help="Scribe transcript JSON") + ap.add_argument("--source", type=Path, required=True, + help="source.mp4 path (recorded in Form-B plans)") + ap.add_argument("--packed", type=Path, default=None, + help="optional takes_packed.md (reserved; unused in v1)") + ap.add_argument("--source-name", default="A", + help="Form-B source name (default 'A')") + ap.add_argument("--context-window", type=float, default=1.5, + help="reserved for future use") + ap.add_argument("--gap-threshold", type=float, default=0.5, + help="silence gap (s) that breaks a candidate. 
default 0.5") + ap.add_argument("--max-cand-dur", type=float, default=12.0, + help="max candidate duration before forced split. default 12.0") + ap.add_argument("--min-cand-dur", type=float, default=0.4, + help="drop candidates shorter than this. default 0.4") + ap.add_argument("--min-score", type=float, default=0.35, + help="score below this triggers a warning. default 0.35") + ap.add_argument("--allow-reuse", action="store_true", + help="allow one candidate to be assigned to multiple cues") + ap.add_argument("--keep-audio-events", action="store_true", + help="keep (laughter) (applause) tokens as candidate context") + ap.add_argument("--format", choices=["form-a", "form-b"], default="form-a", + dest="output_format") + ap.add_argument("-o", "--output", type=Path, required=True, + help="edit_plan.json path") + ap.add_argument("--review", type=Path, default=None, + help="review .md path (default: _review.md)") + args = ap.parse_args() + + assignments = recommend( + script_srt=args.script.resolve(), + transcript=args.transcript.resolve(), + source=args.source.resolve(), + output=args.output.resolve(), + review=args.review.resolve() if args.review else None, + source_name=args.source_name, + output_format=args.output_format, + gap_threshold=args.gap_threshold, + max_cand_dur=args.max_cand_dur, + min_cand_dur=args.min_cand_dur, + min_score=args.min_score, + allow_reuse=args.allow_reuse, + keep_audio_events=args.keep_audio_events, + ) + + matched = sum(1 for a in assignments if a.cand is not None) + warned = sum(1 for a in assignments if a.warnings) + avg = sum(a.score for a in assignments if a.cand is not None) / max(matched, 1) + review_path = ( + args.review.resolve() if args.review + else args.output.resolve().with_name(args.output.stem + "_review.md") + ) + print(f"wrote plan → {args.output}") + print(f"wrote review → {review_path}") + print(f" {matched}/{len(assignments)} cues matched, avg score {avg:.3f}, " + f"{warned} with warnings") + + +if __name__ == 
"__main__": + main() diff --git a/tests/test_recommend_edit_plan.py b/tests/test_recommend_edit_plan.py new file mode 100644 index 0000000..1c4dd0e --- /dev/null +++ b/tests/test_recommend_edit_plan.py @@ -0,0 +1,366 @@ +"""Tests for recommend_edit_plan.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + + +@pytest.fixture +def rec(): + """Convenience: import the module under test as a fixture.""" + import recommend_edit_plan as r + return r + + +@pytest.fixture +def sde(): + import srt_driven_edit as s + return s + + +def write_transcript(path: Path, words: list[dict]) -> None: + """Wrap a flat list of {text,start,end,type} dicts in a Scribe-style envelope.""" + path.write_text( + json.dumps({"language_code": "en", "words": words}, ensure_ascii=False), + encoding="utf-8", + ) + + +def write_srt_cues(path, cues, helpers_ns): + helpers_ns.write_srt(path, cues, encoding="utf-8") + + +# --------------------------------------------------------------------------- +# 1. English exact match — high score, correct range +# --------------------------------------------------------------------------- + + +def test_english_exact_match(rec, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + # Cue duration matches candidate duration exactly so duration warnings stay quiet. 
+ helpers_ns.write_srt(srt, [ + (1, 0.0, 1.0, "Hello world"), + ]) + write_transcript(transcript, [ + {"text": "Hello", "start": 5.0, "end": 5.4, "type": "word"}, + {"text": "world.", "start": 5.4, "end": 6.0, "type": "word"}, + {"text": "Other", "start": 10.0, "end": 10.5, "type": "word"}, + ]) + + assignments = rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + ) + assert len(assignments) == 1 + a = assignments[0] + assert a.cand is not None + assert abs(a.cand.start - 5.0) < 1e-6 + assert abs(a.cand.end - 6.0) < 1e-6 + assert a.score > 0.85, f"exact-text match should score high, got {a.score}" + assert not a.warnings, f"unexpected warnings: {a.warnings}" + + +# --------------------------------------------------------------------------- +# 2. Chinese match — CJK Jaccard path +# --------------------------------------------------------------------------- + + +def test_chinese_match(rec, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [ + (1, 0.0, 3.0, "我们这季度把规划器重写了。"), + ]) + write_transcript(transcript, [ + {"text": "我们", "start": 12.0, "end": 12.4, "type": "word"}, + {"text": "这", "start": 12.4, "end": 12.5, "type": "word"}, + {"text": "季度", "start": 12.5, "end": 13.0, "type": "word"}, + {"text": "把", "start": 13.0, "end": 13.1, "type": "word"}, + {"text": "规划器", "start": 13.1, "end": 14.0, "type": "word"}, + {"text": "重写了。", "start": 14.0, "end": 15.0, "type": "word"}, + # A distractor far away + {"text": "不相关的内容。", "start": 25.0, "end": 26.0, "type": "word"}, + ]) + + assignments = rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + ) + a = assignments[0] + assert a.cand is not None + assert abs(a.cand.start - 12.0) < 1e-6 + assert abs(a.cand.end - 15.0) < 1e-6 + assert a.score > 0.7 + + +# 
--------------------------------------------------------------------------- +# 3. Punctuation + case differences still match +# --------------------------------------------------------------------------- + + +def test_punct_and_case_invariant(rec, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + # SRT: lowercase, no punct, matching duration + helpers_ns.write_srt(srt, [ + (1, 0.0, 2.0, "hello there friends"), + ]) + # Transcript: mixed case + phrase punct (commas keep words grouped); the + # SENTENCE-end '!' only on the last word so all three stay in one candidate. + write_transcript(transcript, [ + {"text": "HELLO,", "start": 1.0, "end": 1.5, "type": "word"}, + {"text": "There,", "start": 1.5, "end": 2.0, "type": "word"}, + {"text": "FRIENDS!", "start": 2.0, "end": 3.0, "type": "word"}, + ]) + assignments = rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + ) + a = assignments[0] + assert a.cand is not None + assert a.score > 0.85, f"normalization should erase case+punct, got {a.score}" + + +# --------------------------------------------------------------------------- +# 4. Silence gap splits candidates +# --------------------------------------------------------------------------- + + +def test_silence_gap_splits(rec, tmp_path): + """Two phrases separated by a 1.0s silence should produce two candidates, + not one — even though neither phrase ends in sentence-end punctuation. 
+ """ + transcript = tmp_path / "transcript.json" + write_transcript(transcript, [ + {"text": "alpha", "start": 1.0, "end": 1.4, "type": "word"}, + {"text": "beta", "start": 1.4, "end": 2.0, "type": "word"}, + # 1.0s silence + {"text": "gamma", "start": 3.0, "end": 3.4, "type": "word"}, + {"text": "delta", "start": 3.4, "end": 4.0, "type": "word"}, + ]) + words = rec.load_transcript_words(transcript) + candidates = rec.build_candidates(words, gap_threshold=0.5) + assert len(candidates) == 2 + assert abs(candidates[0].start - 1.0) < 1e-6 and abs(candidates[0].end - 2.0) < 1e-6 + assert abs(candidates[1].start - 3.0) < 1e-6 and abs(candidates[1].end - 4.0) < 1e-6 + # Tightening the gap shouldn't merge them (still well over threshold) + # Loosening past 1.0s should: + merged = rec.build_candidates(words, gap_threshold=1.1) + assert len(merged) == 1 + + +# --------------------------------------------------------------------------- +# 5. Low-score match emits warning +# --------------------------------------------------------------------------- + + +def test_low_score_warning(rec, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + # Cue text shares almost no tokens with any candidate + helpers_ns.write_srt(srt, [ + (1, 0.0, 2.0, "quantum entanglement decoherence"), + ]) + write_transcript(transcript, [ + {"text": "apple", "start": 1.0, "end": 1.5, "type": "word"}, + {"text": "banana", "start": 1.5, "end": 2.0, "type": "word"}, + ]) + assignments = rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + min_score=0.5, # set high to force the warning + ) + a = assignments[0] + assert a.cand is not None # still got SOME candidate + assert any("low score" in w for w in a.warnings) + + +# --------------------------------------------------------------------------- +# 6. 
SRT id ordering preserved in output +# --------------------------------------------------------------------------- + + +def test_ids_preserved(rec, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [ + (1, 0.0, 1.0, "alpha"), + (2, 1.0, 2.0, "beta"), + (3, 2.0, 3.0, "gamma"), + ]) + write_transcript(transcript, [ + {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"}, + {"text": "beta.", "start": 5.0, "end": 5.5, "type": "word"}, + {"text": "gamma.", "start": 10.0, "end": 10.5, "type": "word"}, + ]) + out = tmp_path / "plan.json" + rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=out, + ) + plan_rows = json.loads(out.read_text(encoding="utf-8")) + assert [r["id"] for r in plan_rows] == [1, 2, 3] + + +# --------------------------------------------------------------------------- +# 7. Output is parseable by srt_driven_edit.parse_plan +# --------------------------------------------------------------------------- + + +def test_output_is_parseable_by_sde(rec, sde, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [ + (1, 0.0, 1.0, "alpha"), + (2, 1.0, 2.0, "beta"), + ]) + write_transcript(transcript, [ + {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"}, + {"text": "beta.", "start": 5.0, "end": 5.5, "type": "word"}, + ]) + out = tmp_path / "plan.json" + rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=out, + ) + + sources, voices, entries = sde.parse_plan(out) + assert sources == {} and voices == {} # Form A — no maps + assert [e.id for e in entries] == [1, 2] + assert all(e.source_name == "_default" for e in entries) + assert entries[0].source_start == 1.0 and entries[0].source_end == 1.5 + assert entries[1].source_start == 5.0 and entries[1].source_end == 5.5 + + +# 
--------------------------------------------------------------------------- +# 8. Form B output carries the source name +# --------------------------------------------------------------------------- + + +def test_form_b_output(rec, sde, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [(1, 0.0, 1.0, "alpha")]) + write_transcript(transcript, [ + {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"}, + ]) + out = tmp_path / "plan.json" + rec.recommend( + script_srt=srt, transcript=transcript, + source=tmp_path / "src.mp4", source_name="TAKE_A", + output_format="form-b", output=out, + ) + data = json.loads(out.read_text(encoding="utf-8")) + assert "TAKE_A" in data["sources"] + assert data["segments"][0]["source"] == "TAKE_A" + # And it's parseable by sde.parse_plan too + sources, _, entries = sde.parse_plan(out) + assert "TAKE_A" in sources + assert entries[0].source_name == "TAKE_A" + + +# --------------------------------------------------------------------------- +# 9. No candidates → hard fail (per spec) +# --------------------------------------------------------------------------- + + +def test_no_candidates_aborts(rec, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [(1, 0.0, 1.0, "alpha")]) + # Transcript has only audio_event (no words) + write_transcript(transcript, [ + {"text": "(laughter)", "start": 1.0, "end": 2.0, "type": "audio_event"}, + ]) + with pytest.raises(SystemExit): + rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + ) + + +# --------------------------------------------------------------------------- +# 10. 
Review markdown shows score + warnings +# --------------------------------------------------------------------------- + + +def test_review_markdown_content(rec, helpers_ns, tmp_path): + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [(1, 0.0, 2.0, "Hello world")]) + write_transcript(transcript, [ + {"text": "Hello", "start": 1.0, "end": 1.5, "type": "word"}, + {"text": "world.", "start": 1.5, "end": 2.0, "type": "word"}, + ]) + out = tmp_path / "plan.json" + rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=out, + ) + review = (out.with_name(out.stem + "_review.md")).read_text(encoding="utf-8") + assert "cue id=1" in review + assert "Hello world" in review + assert "**score**" in review + assert "**source range**" in review + + +# --------------------------------------------------------------------------- +# 11. End-to-end: recommend → run_job → final mp4 exists +# --------------------------------------------------------------------------- + + +def test_e2e_recommend_then_render( + rec, sde, helpers_ns, synth_av, tmp_path +): + """Full chain: fabricated transcript → recommend → run_job → final.mp4.""" + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + plan = tmp_path / "plan.json" + out_mp4 = tmp_path / "final.mp4" + + # 3 cues totaling 6s of output + helpers_ns.write_srt(srt, [ + (1, 0.0, 2.0, "alpha beta"), + (2, 2.0, 4.0, "gamma delta"), + (3, 4.0, 6.0, "epsilon zeta"), + ]) + # Transcript: words that match each cue at distinct, valid times in synth_av (30s) + # Each candidate is exactly 2s — matches cue duration exactly so no on-short needed. 
+ write_transcript(transcript, [ + {"text": "alpha", "start": 1.0, "end": 1.8, "type": "word"}, + {"text": "beta.", "start": 1.8, "end": 3.0, "type": "word"}, + # silence gap + {"text": "gamma", "start": 8.0, "end": 8.8, "type": "word"}, + {"text": "delta.", "start": 8.8, "end": 10.0, "type": "word"}, + # silence gap + {"text": "epsilon", "start": 18.0, "end": 18.8, "type": "word"}, + {"text": "zeta.", "start": 18.8, "end": 20.0, "type": "word"}, + ]) + + assignments = rec.recommend( + script_srt=srt, transcript=transcript, source=synth_av, + output=plan, + ) + assert len(assignments) == 3 + assert all(a.cand is not None for a in assignments) + + # Render via the existing pipeline + ffmpeg_version = sde.preflight()["ffmpeg"] + job = sde.Job( + source=synth_av, + srt=srt, plan=plan, + voice=None, bg_volume=0.0, + tolerance=0.5, trim_direction="tail", on_short="error", + style="auto", fontsdir=None, + output=out_mp4, + name="e2e", + no_cache=False, keep_intermediates=False, no_overwrite=False, + ) + qc = sde.run_job(job, ffmpeg_version) + assert qc["ok"] is True + assert out_mp4.exists() + assert abs(qc["duration"]["drift_ms"]) <= 200 From 87439d1bf4a2acb08e5261de4e3ab0efe3788bea Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Tue, 19 May 2026 21:43:40 +0800 Subject: [PATCH 03/18] docs: add CLAUDE.md and AGENTS.md project guidance for AI assistants CLAUDE.md is auto-loaded by Claude Code when working in this directory, giving sessions a consistent picture of the project's scope, tech constraints, and out-of-bounds behaviors before the user has to say it. AGENTS.md does the same for Codex review sessions, classifying review output into must-fix / should-improve / later so suggestions are actionable rather than open-ended rewrites. 
Co-Authored-By: Claude Opus 4.7 --- AGENTS.md | 23 +++++++++++++++++++++++ CLAUDE.md | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 AGENTS.md create mode 100644 CLAUDE.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..7715dcb --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,23 @@ +# Agent Review Instructions + +You are reviewing a Python + ffmpeg video editing tool. + +Main goal: +Build a reliable SRT-driven video editor for Chinese drama recap videos. + +Please focus on: +- code structure +- ffmpeg stability +- SRT parsing correctness +- JSON validation +- Windows path compatibility +- Chinese subtitle rendering +- error handling +- extensibility + +Do not rewrite the entire project unless necessary. +Prefer small, safe patches. +Classify suggestions into: +1. Must fix +2. Should improve +3. Later diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..61b1e4c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,33 @@ +# srt_video_editor 项目说明 + +本项目是电视剧解说自动剪辑工具。 + +核心目标: +根据 script.srt 和 edit_plan.json,从 source.mp4 中截取画面,拼接、加配音、烧字幕,输出 final.mp4。 + +当前阶段: +只做 SRT 驱动剪辑,不做 AI 自动理解剧情。 + +技术要求: +- Python 3.10+ +- ffmpeg +- Windows 优先 +- 路径尽量使用英文 +- 不使用 moviepy,优先直接调用 ffmpeg +- 输出日志要清楚 +- 不要引入复杂前端 + +核心输入: +- input/source.mp4 +- input/script.srt +- input/edit_plan.json +- input/voice.wav + +核心输出: +- output/final.mp4 + +禁止事项: +- 不要破解剪映 +- 不要调用未授权接口 +- 不要一次性做复杂 AI 自动分析 +- 不要改动 input 原始文件 From b7dbd6e4c52b39c69c68fd1ff3b094ab9c5eb631 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Tue, 19 May 2026 22:36:34 +0800 Subject: [PATCH 04/18] feat(run_episodes): discover-and-run episode batches by directory layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convention-driven multi-episode runner. Given a root containing one subdirectory per episode, discovers eps that have the required file set and runs srt_driven_edit on each. 
Complements the existing jobs.json / jobs.csv manifest path with a flatter, zero-config workflow. Per-episode layout (all under <root>/<ep>/): source.mp4 required script.srt required edit_plan.json required (Form A or B) voice.wav optional — wired in as the ep's global voice Outputs land at <root>/<ep>/final.mp4 with edit/ artifacts (EDL, QC, cache) inside each ep dir; an aggregate summary lands at <root>/run_episodes_summary.json. Dirs missing required files are SKIPPED with a printed reason rather than aborting, so a partial batch is still actionable. Hard-fails only when no usable ep is found. --continue-on-error makes per-ep ffmpeg failures non-fatal too; without it, the first failure aborts the run. Process exits non-zero if any episode failed, even in continue mode. Includes 7 pytest cases: - discover skips incomplete dirs without erroring - discover picks up voice.wav when present - empty root raises - full e2e with 3 synthetic eps each producing final.mp4 - continue-on-error skips ep with out-of-bounds range, finishes others - hard abort without continue-on-error - per-ep voice.wav reflected in QC audio.mode Co-Authored-By: Claude Opus 4.7 --- helpers/run_episodes.py | 231 +++++++++++++++++++++++++++++++++++++ tests/test_run_episodes.py | 163 ++++++++++++++++++++++++++ 2 files changed, 394 insertions(+) create mode 100644 helpers/run_episodes.py create mode 100644 tests/test_run_episodes.py diff --git a/helpers/run_episodes.py b/helpers/run_episodes.py new file mode 100644 index 0000000..4ba5b1b --- /dev/null +++ b/helpers/run_episodes.py @@ -0,0 +1,231 @@ +"""Run srt_driven_edit across every episode subdirectory under a root. + +Discovery convention (flat per-episode layout): + <root>/<ep>/source.mp4 required + <root>/<ep>/script.srt required + <root>/<ep>/edit_plan.json required (Form A or B) + <root>/<ep>/voice.wav optional (global voice for this ep) + +Outputs: + <root>/<ep>/final.mp4 + <root>/<ep>/edit/...
(EDL, QC report, cache — managed by srt_driven_edit) + <root>/run_episodes_summary.json + +Usage: + python helpers/run_episodes.py batch/ + python helpers/run_episodes.py batch/ --bg-volume 0.1 --style cjk-natural + python helpers/run_episodes.py batch/ --continue-on-error +""" + +from __future__ import annotations + +import argparse +import json +import time +from dataclasses import dataclass +from pathlib import Path + +try: + from srt_driven_edit import Job, run_job, preflight, safe_ascii_name +except Exception as e: + raise SystemExit( + "run_episodes: failed to import from srt_driven_edit.py. " + f"Both files must be importable from the same helpers/ dir. ({e})" + ) + + +REQUIRED_FILES = ("source.mp4", "script.srt", "edit_plan.json") +OPTIONAL_VOICE = "voice.wav" + + +@dataclass +class EpisodeJob: + name: str + root: Path + source: Path + srt: Path + plan: Path + voice: Path | None + + +def discover_episodes(root: Path) -> list[EpisodeJob]: + """Return episode dirs under `root` that have the required file set. + + Dirs missing a required file are skipped with a printed reason — never + cause a hard failure here, so a partial batch is still actionable. + Hard-fails only if NO usable dir is found.
+ """ + if not root.is_dir(): + raise SystemExit(f"not a directory: {root}") + + eps: list[EpisodeJob] = [] + skipped: list[tuple[str, list[str]]] = [] + for sub in sorted(root.iterdir(), key=lambda p: p.name): + if not sub.is_dir(): + continue + missing = [f for f in REQUIRED_FILES if not (sub / f).is_file()] + if missing: + skipped.append((sub.name, missing)) + continue + voice = sub / OPTIONAL_VOICE + eps.append(EpisodeJob( + name=sub.name, + root=sub.resolve(), + source=(sub / "source.mp4").resolve(), + srt=(sub / "script.srt").resolve(), + plan=(sub / "edit_plan.json").resolve(), + voice=voice.resolve() if voice.is_file() else None, + )) + + if skipped: + print(f"skipped {len(skipped)} dir(s) missing required files:") + for name, miss in skipped: + print(f" {name}: missing {', '.join(miss)}") + if not eps: + raise SystemExit( + f"no usable episode dirs under {root}. Each ep dir needs: " + f"{list(REQUIRED_FILES)}" + ) + return eps + + +def _make_job(ep: EpisodeJob, opts: dict) -> Job: + return Job( + source=ep.source, + srt=ep.srt, + plan=ep.plan, + voice=ep.voice, + bg_volume=opts["bg_volume"], + tolerance=opts["tolerance"], + trim_direction=opts["trim_direction"], + on_short=opts["on_short"], + style=opts["style"], + fontsdir=opts["fontsdir"], + output=ep.root / "final.mp4", + name=ep.name, + no_cache=opts["no_cache"], + keep_intermediates=opts["keep_intermediates"], + no_overwrite=opts["no_overwrite"], + ) + + +def run_episodes( + root: Path, + *, + ffmpeg_version: str, + bg_volume: float = 0.0, + tolerance: float = 0.5, + trim_direction: str = "tail", + on_short: str = "error", + style: str = "auto", + fontsdir: Path | None = None, + no_cache: bool = False, + no_overwrite: bool = False, + keep_intermediates: bool = False, + continue_on_error: bool = False, +) -> dict: + """Discover + run every episode under `root`. 
Returns a summary dict and + also writes it to `/run_episodes_summary.json`.""" + root = root.resolve() + eps = discover_episodes(root) + print(f"\ndiscovered {len(eps)} episode(s) under {root}:") + for ep in eps: + print(f" {ep.name} voice={'yes' if ep.voice else 'no'}") + + opts = { + "bg_volume": bg_volume, + "tolerance": tolerance, + "trim_direction": trim_direction, + "on_short": on_short, + "style": style, + "fontsdir": fontsdir, + "no_cache": no_cache, + "no_overwrite": no_overwrite, + "keep_intermediates": keep_intermediates, + } + + results: list[dict] = [] + t0 = time.time() + for i, ep in enumerate(eps, start=1): + print(f"\n[{i}/{len(eps)}] === {ep.name} ===") + job = _make_job(ep, opts) + try: + qc = run_job(job, ffmpeg_version) + results.append(qc) + except SystemExit as e: + if continue_on_error: + print(f"[{i}/{len(eps)}] FAILED: {e}") + results.append({"job": ep.name, "ok": False, "error": str(e)}) + continue + raise + + ok = sum(1 for r in results if r.get("ok")) + summary = { + "root": str(root), + "episodes_total": len(eps), + "ok": ok, + "elapsed_s": round(time.time() - t0, 2), + "results": results, + } + summary_path = root / "run_episodes_summary.json" + summary_path.write_text( + json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8" + ) + print(f"\n{ok}/{len(results)} episodes ok ({summary['elapsed_s']}s)") + print(f"summary → {summary_path}") + return summary + + +def main() -> None: + ap = argparse.ArgumentParser( + description="Run srt_driven_edit across every ep*/ subdirectory.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=( + "Per-episode layout:\n" + " //source.mp4 required\n" + " //script.srt required\n" + " //edit_plan.json required (Form A or B)\n" + " //voice.wav optional\n\n" + "Outputs land at //final.mp4 with edit/ artifacts." 
+ ), + ) + ap.add_argument("root", type=Path, + help="directory whose immediate subdirs are episodes") + ap.add_argument("--bg-volume", type=float, default=0.0) + ap.add_argument("--tolerance", type=float, default=0.5) + ap.add_argument("--trim-direction", choices=["tail", "head", "center"], default="tail") + ap.add_argument("--on-short", choices=["error", "pad"], default="error") + ap.add_argument("--style", default="auto") + ap.add_argument("--fontsdir", type=Path, default=None) + ap.add_argument("--no-cache", action="store_true") + ap.add_argument("--no-overwrite", action="store_true") + ap.add_argument("--keep-intermediates", action="store_true") + ap.add_argument("--continue-on-error", action="store_true", + help="skip episodes that fail instead of aborting") + args = ap.parse_args() + + versions = preflight() + print(f"== ffmpeg {versions['ffmpeg']} / ffprobe {versions['ffprobe']} ==") + + summary = run_episodes( + args.root, + ffmpeg_version=versions["ffmpeg"], + bg_volume=args.bg_volume, + tolerance=args.tolerance, + trim_direction=args.trim_direction, + on_short=args.on_short, + style=args.style, + fontsdir=args.fontsdir.resolve() if args.fontsdir else None, + no_cache=args.no_cache, + no_overwrite=args.no_overwrite, + keep_intermediates=args.keep_intermediates, + continue_on_error=args.continue_on_error, + ) + # Exit nonzero if any episode failed (even with --continue-on-error, + # the caller probably wants to know). 
+ if summary["ok"] < summary["episodes_total"]: + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/test_run_episodes.py b/tests/test_run_episodes.py new file mode 100644 index 0000000..7003399 --- /dev/null +++ b/tests/test_run_episodes.py @@ -0,0 +1,163 @@ +"""Tests for the multi-episode batch runner.""" + +from __future__ import annotations + +import shutil +from pathlib import Path + +import pytest + + +@pytest.fixture +def runner(): + import run_episodes + return run_episodes + + +@pytest.fixture +def ffmpeg_version(helpers_ns): + return helpers_ns.sde.preflight()["ffmpeg"] + + +def _make_ep(ep_dir: Path, source: Path, helpers_ns, *, + cues=None, plan=None, voice: Path | None = None) -> None: + ep_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, ep_dir / "source.mp4") + helpers_ns.write_srt(ep_dir / "script.srt", cues or [ + (1, 0.0, 1.5, "alpha"), + (2, 1.5, 3.0, "beta"), + ]) + helpers_ns.write_plan_form_a(ep_dir / "edit_plan.json", plan or [ + (1, 1.0, 2.5), + (2, 5.0, 6.5), + ]) + if voice is not None: + shutil.copy2(voice, ep_dir / "voice.wav") + + +# --------------------------------------------------------------------------- +# 1. 
Discovery: pick up complete dirs, skip incomplete ones +# --------------------------------------------------------------------------- + + +def test_discover_skips_incomplete_dirs(runner, helpers_ns, synth_av, tmp_path): + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + _make_ep(batch / "ep02", synth_av, helpers_ns) + # incomplete: missing edit_plan.json + bad = batch / "ep03" + bad.mkdir(parents=True) + shutil.copy2(synth_av, bad / "source.mp4") + helpers_ns.write_srt(bad / "script.srt", [(1, 0.0, 1.5, "x")]) + # not a dir at all + (batch / "stray.txt").write_text("ignore me", encoding="utf-8") + + eps = runner.discover_episodes(batch) + names = [e.name for e in eps] + assert names == ["ep01", "ep02"] + + +def test_discover_sees_voice_wav_if_present( + runner, helpers_ns, synth_av, synth_voice, tmp_path +): + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + _make_ep(batch / "ep02", synth_av, helpers_ns, voice=synth_voice) + + eps = runner.discover_episodes(batch) + by_name = {e.name: e for e in eps} + assert by_name["ep01"].voice is None + assert by_name["ep02"].voice is not None and by_name["ep02"].voice.is_file() + + +def test_discover_hard_fails_on_empty_root(runner, tmp_path): + batch = tmp_path / "empty" + batch.mkdir() + with pytest.raises(SystemExit) as exc: + runner.discover_episodes(batch) + assert "no usable" in str(exc.value) + + +# --------------------------------------------------------------------------- +# 2. 
End-to-end: 3 eps run sequentially, each produces final.mp4 +# --------------------------------------------------------------------------- + + +def test_run_episodes_e2e(runner, helpers_ns, ffmpeg_version, synth_av, tmp_path): + batch = tmp_path / "batch" + for name in ("ep01", "ep02", "ep03"): + _make_ep(batch / name, synth_av, helpers_ns) + + summary = runner.run_episodes(batch, ffmpeg_version=ffmpeg_version) + + assert summary["episodes_total"] == 3 + assert summary["ok"] == 3 + for name in ("ep01", "ep02", "ep03"): + final = batch / name / "final.mp4" + assert final.exists(), f"{name}/final.mp4 missing" + + # Summary artifact + summary_file = batch / "run_episodes_summary.json" + assert summary_file.exists() + + +# --------------------------------------------------------------------------- +# 3. continue-on-error skips a broken ep, finishes the rest +# --------------------------------------------------------------------------- + + +def test_run_episodes_continue_on_error( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + # ep02: range exceeds the synth source (30s) — pre-extract range check fires + _make_ep(batch / "ep02", synth_av, helpers_ns, + plan=[(1, 1.0, 2.5), (2, 60.0, 61.5)]) + _make_ep(batch / "ep03", synth_av, helpers_ns) + + summary = runner.run_episodes( + batch, ffmpeg_version=ffmpeg_version, + continue_on_error=True, + ) + assert summary["episodes_total"] == 3 + assert summary["ok"] == 2 + # ep01 + ep03 produced output, ep02 did not + assert (batch / "ep01" / "final.mp4").exists() + assert not (batch / "ep02" / "final.mp4").exists() + assert (batch / "ep03" / "final.mp4").exists() + + +def test_run_episodes_aborts_without_continue_on_error( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + _make_ep(batch / "ep02", synth_av, helpers_ns, + plan=[(1, 1.0, 2.5), (2, 60.0, 
61.5)]) # bad + _make_ep(batch / "ep03", synth_av, helpers_ns) + + with pytest.raises(SystemExit): + runner.run_episodes(batch, ffmpeg_version=ffmpeg_version) + # ep03 was never reached + assert not (batch / "ep03" / "final.mp4").exists() + + +# --------------------------------------------------------------------------- +# 4. Per-ep voice.wav becomes a global voice for that ep +# --------------------------------------------------------------------------- + + +def test_run_episodes_per_ep_voice( + runner, helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path +): + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + _make_ep(batch / "ep02", synth_av, helpers_ns, voice=synth_voice) + + summary = runner.run_episodes(batch, ffmpeg_version=ffmpeg_version) + + by_name = {r["job"]: r for r in summary["results"]} + assert by_name["ep01"]["audio"]["voice_used"] is False + assert by_name["ep02"]["audio"]["voice_used"] is True + assert by_name["ep02"]["audio"]["mode"] == "voice_replace" From e0be38f121f2cc45915937331eeefd94f7690c94 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Tue, 19 May 2026 22:57:35 +0800 Subject: [PATCH 05/18] feat(batch): enrich failure records with index, paths, and stderr tail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now a failed batch row only recorded {job, ok: False, error}. To diagnose an ffmpeg crash you had to scroll the terminal back; for a malformed manifest row you had no idea which row index errored. This commit adds a structured diagnostic payload to every failure entry in both the srt_driven_edit batch path and the run_episodes path. New shape per failed entry: {job, ok: False, index, error, srt, plan, source, output, stderr_tail} - `index` is the 0-based position in the manifest / discovered ep list, so the summary trivially round-trips back to the bad row. 
- `srt` / `plan` / `source` / `output` come from the resolved Job when available; for rows that crash inside job_from_dict (no Job yet), they fall back to the raw manifest_row dict so context is never lost. - `stderr_tail` is the last 30 lines / 2 KB of ffmpeg's stderr, populated only when the failure originated in run_ff. Pre-flight / validation errors leave it empty by design. To carry the stderr tail without breaking the existing `except SystemExit:` pattern, add a `PipelineError(SystemExit)` subclass with a `.stderr_tail` attribute, raised by `run_ff` on non-zero exit. Existing handlers continue to work via `getattr(e, "stderr_tail", "")`. The new helper `make_failure_record(...)` is exported from srt_driven_edit and reused by both the CLI's batch loop and run_episodes.run_episodes so the two paths stay in sync. Tests added (4): - test_run_ff_raises_pipeline_error_with_stderr — direct unit test of PipelineError carrying real ffmpeg stderr - test_batch_failure_record_includes_paths — out-of-bounds range fails pre-extract; record carries index/srt/plan/source/output, empty stderr_tail - test_batch_malformed_row_failure_record — row missing 'plan' still yields a usable record sourced from the raw manifest row - test_run_episodes_failure_record_includes_paths — same for the directory-based runner Co-Authored-By: Claude Opus 4.7 --- helpers/run_episodes.py | 15 ++++-- helpers/srt_driven_edit.py | 94 ++++++++++++++++++++++++++++++++-- tests/test_run_episodes.py | 29 +++++++++++ tests/test_srt_driven_batch.py | 93 ++++++++++++++++++++++++++++++--- 4 files changed, 217 insertions(+), 14 deletions(-) diff --git a/helpers/run_episodes.py b/helpers/run_episodes.py index 4ba5b1b..847f406 100644 --- a/helpers/run_episodes.py +++ b/helpers/run_episodes.py @@ -26,7 +26,10 @@ from pathlib import Path try: - from srt_driven_edit import Job, run_job, preflight, safe_ascii_name + from srt_driven_edit import ( + Job, run_job, preflight, safe_ascii_name, + make_failure_record, + ) 
except Exception as e: raise SystemExit( "run_episodes: failed to import from srt_driven_edit.py. " @@ -146,16 +149,18 @@ def run_episodes( results: list[dict] = [] t0 = time.time() - for i, ep in enumerate(eps, start=1): - print(f"\n[{i}/{len(eps)}] === {ep.name} ===") + for i, ep in enumerate(eps): + print(f"\n[{i + 1}/{len(eps)}] === {ep.name} ===") job = _make_job(ep, opts) try: qc = run_job(job, ffmpeg_version) results.append(qc) except SystemExit as e: if continue_on_error: - print(f"[{i}/{len(eps)}] FAILED: {e}") - results.append({"job": ep.name, "ok": False, "error": str(e)}) + print(f"[{i + 1}/{len(eps)}] FAILED: {e}") + results.append(make_failure_record( + index=i, name=ep.name, error=e, job=job, + )) continue raise diff --git a/helpers/srt_driven_edit.py b/helpers/srt_driven_edit.py index 3cfa052..ac9d2f5 100644 --- a/helpers/srt_driven_edit.py +++ b/helpers/srt_driven_edit.py @@ -716,13 +716,48 @@ def cache_store(cache_dir: Path, key: str, clip_path: Path) -> None: # ============================================================================ +class PipelineError(SystemExit): + """SystemExit subclass carrying ffmpeg stderr context for diagnostics. + + Batch loops pattern-match on `stderr_tail` to write a richer failure + record. Plain SystemExit raised by pre-flight / validation code keeps + working — callers use `getattr(e, 'stderr_tail', '')` so both branches + of `try/except SystemExit` flow through the same handler. + """ + def __init__(self, message: str, *, stderr_tail: str = ""): + super().__init__(message) + self.stderr_tail = stderr_tail + + +def _tail_text(s: str, *, max_lines: int = 30, max_chars: int = 2000) -> str: + """Return the last `max_lines` of `s`, capped at `max_chars`. + + Used to attach a readable slice of ffmpeg's stderr to PipelineError — + enough to diagnose, not so much that batch summaries balloon. 
+ """ + if not s: + return "" + lines = s.strip().splitlines() + tail = "\n".join(lines[-max_lines:]) + if len(tail) > max_chars: + tail = "...[truncated]...\n" + tail[-(max_chars - 22):] + return tail + + def run_ff(cmd: list[str], desc: str) -> None: print(f" $ {desc}") proc = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace") if proc.returncode != 0: + # Stream raw stderr to the console so an interactive user sees the + # failure live; also attach a bounded tail to the exception so a + # batch summary can capture diagnostic context without keeping the + # full stderr in memory or in the JSON. sys.stderr.write(proc.stderr or "") - raise SystemExit(f"ffmpeg failed: {desc}") + raise PipelineError( + f"ffmpeg failed: {desc}", + stderr_tail=_tail_text(proc.stderr or ""), + ) def probe_duration(path: Path) -> float: @@ -1346,6 +1381,54 @@ def run_job(job: Job, ffmpeg_version: str) -> dict: # ============================================================================ +def make_failure_record( + *, + index: int, + name: str, + error: BaseException, + job: "Job | None" = None, + manifest_row: dict | None = None, +) -> dict: + """Build a diagnostic failure entry for a batch summary. + + Shape: `{job, ok=False, index, error, stderr_tail, srt, plan, source, output}`. + + `stderr_tail` is non-empty only for `PipelineError` (i.e. ffmpeg failures); + plain `SystemExit` from validation paths leaves it as "". When `job` is + provided, paths come from the resolved Job; otherwise they fall back to + the raw manifest_row dict so rows that crash inside `job_from_dict` + still get useful context. 
+ """ + stderr_tail = "" + if isinstance(error, PipelineError): + stderr_tail = error.stderr_tail or "" + + if job is not None: + srt = str(job.srt) if job.srt else None + plan = str(job.plan) if job.plan else None + source = str(job.source) if job.source else None + output = str(job.output) if job.output else None + elif manifest_row is not None: + srt = manifest_row.get("srt") + plan = manifest_row.get("plan") + source = manifest_row.get("source") + output = manifest_row.get("output") + else: + srt = plan = source = output = None + + return { + "job": name, + "ok": False, + "index": index, + "error": str(error), + "stderr_tail": stderr_tail, + "srt": srt, + "plan": plan, + "source": source, + "output": output, + } + + def load_manifest(path: Path) -> list[dict]: suffix = path.suffix.lower() if suffix == ".json": @@ -1473,7 +1556,10 @@ def main() -> None: except SystemExit as e: if args.continue_on_error: print(f"[batch {i}] skipped: {e}") - results.append({"job": row.get("name", f"row{i}"), "ok": False, "error": str(e)}) + results.append(make_failure_record( + index=i, name=row.get("name", f"row{i}"), + error=e, job=None, manifest_row=row, + )) continue raise try: @@ -1481,7 +1567,9 @@ def main() -> None: except SystemExit as e: if args.continue_on_error: print(f"[batch {i}] FAILED: {e}") - results.append({"job": job.name, "ok": False, "error": str(e)}) + results.append(make_failure_record( + index=i, name=job.name, error=e, job=job, + )) continue raise summary_path = manifest_path.with_name(manifest_path.stem + "_qc_summary.json") diff --git a/tests/test_run_episodes.py b/tests/test_run_episodes.py index 7003399..754ce42 100644 --- a/tests/test_run_episodes.py +++ b/tests/test_run_episodes.py @@ -148,6 +148,35 @@ def test_run_episodes_aborts_without_continue_on_error( # --------------------------------------------------------------------------- +def test_run_episodes_failure_record_includes_paths( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + 
"""When --continue-on-error skips an ep, the record must carry enough + context to triage without re-reading the terminal.""" + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + _make_ep(batch / "ep02", synth_av, helpers_ns, + plan=[(1, 1.0, 2.5), (2, 60.0, 61.5)]) # range overruns 30s synth + _make_ep(batch / "ep03", synth_av, helpers_ns) + + summary = runner.run_episodes( + batch, ffmpeg_version=ffmpeg_version, + continue_on_error=True, + ) + failed = [r for r in summary["results"] if not r.get("ok")] + assert len(failed) == 1 + rec = failed[0] + assert rec["job"] == "ep02" + assert rec["index"] == 1 + assert rec["srt"].endswith("script.srt") + assert rec["plan"].endswith("edit_plan.json") + assert rec["source"].endswith("source.mp4") + assert rec["output"].endswith("final.mp4") + assert rec["error"] + # Pre-extract range-bounds check → no ffmpeg → empty stderr + assert rec["stderr_tail"] == "" + + def test_run_episodes_per_ep_voice( runner, helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path ): diff --git a/tests/test_srt_driven_batch.py b/tests/test_srt_driven_batch.py index 47ad264..8a41c90 100644 --- a/tests/test_srt_driven_batch.py +++ b/tests/test_srt_driven_batch.py @@ -54,18 +54,19 @@ def run_batch(helpers_ns, manifest_path, ffmpeg_version, *, job = sde.job_from_dict(row, defaults, manifest_path.parent, i) except SystemExit as e: if continue_on_error: - results.append({ - "job": row.get("name", f"row{i}"), - "ok": False, - "error": str(e), - }) + results.append(sde.make_failure_record( + index=i, name=row.get("name", f"row{i}"), + error=e, job=None, manifest_row=row, + )) continue raise try: results.append(sde.run_job(job, ffmpeg_version)) except SystemExit as e: if continue_on_error: - results.append({"job": job.name, "ok": False, "error": str(e)}) + results.append(sde.make_failure_record( + index=i, name=job.name, error=e, job=job, + )) continue raise return results @@ -187,6 +188,86 @@ def 
test_batch_csv_manifest(helpers_ns, ffmpeg_version, synth_av, tmp_path): # --------------------------------------------------------------------------- +def test_run_ff_raises_pipeline_error_with_stderr(helpers_ns, tmp_path): + """run_ff must raise PipelineError carrying a non-empty stderr tail.""" + sde = helpers_ns.sde + out = tmp_path / "out.mp4" + bogus = tmp_path / "definitely_missing.mp4" + with pytest.raises(sde.PipelineError) as exc: + sde.run_ff( + ["ffmpeg", "-y", "-hide_banner", "-i", str(bogus), str(out)], + "intentional failure", + ) + # Subclass of SystemExit → existing handlers keep working + assert isinstance(exc.value, SystemExit) + assert exc.value.stderr_tail, "stderr_tail should be populated on ffmpeg failure" + # The stderr from ffmpeg complaining about a missing input should mention it + assert "definitely_missing.mp4" in exc.value.stderr_tail \ + or "No such file" in exc.value.stderr_tail + + +def test_batch_failure_record_includes_paths( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """A failed batch row must carry index/srt/plan/source/output for triage.""" + helpers_ns.write_srt(tmp_path / "s_ok.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p_ok.json", PLAN_2) + helpers_ns.write_srt(tmp_path / "s_bad.srt", CUES_2) + # out-of-bounds range (synth_av is 30s; 60s exceeds it) — fails in pre-flight, + # no ffmpeg invocation → stderr_tail should stay empty. 
+ helpers_ns.write_plan_form_a(tmp_path / "p_bad.json", + [(1, 1.0, 3.0), (2, 60.0, 62.0)]) + + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "ok", "source": str(synth_av), + "srt": "s_ok.srt", "plan": "p_ok.json"}, + {"name": "bad", "source": str(synth_av), + "srt": "s_bad.srt", "plan": "p_bad.json"}, + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest_path, ffmpeg_version, + continue_on_error=True) + assert len(results) == 2 and results[0]["ok"] is True + failed = results[1] + assert failed["ok"] is False + assert failed["job"] == "bad" + assert failed["index"] == 1 + assert failed["srt"] and failed["srt"].endswith("s_bad.srt") + assert failed["plan"] and failed["plan"].endswith("p_bad.json") + assert failed["source"] == str(synth_av) + assert failed["output"] and failed["output"].endswith(".mp4") + assert failed["error"] + # Range-bounds check fires before any ffmpeg → no stderr + assert failed["stderr_tail"] == "" + + +def test_batch_malformed_row_failure_record(helpers_ns, ffmpeg_version, tmp_path): + """A row that fails inside job_from_dict still gets a usable record. + + No Job was ever constructed, so paths come from the raw manifest row. 
+ """ + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "broken", + "source": "raw/take.mp4", + "srt": "scripts/missing.srt"}, # no `plan` field + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest_path, ffmpeg_version, + continue_on_error=True) + assert len(results) == 1 + failed = results[0] + assert failed["ok"] is False + assert failed["job"] == "broken" + assert failed["index"] == 0 + # Source / SRT come from the row dict because Job construction never completed + assert failed["source"] == "raw/take.mp4" + assert failed["srt"] == "scripts/missing.srt" + assert failed["plan"] is None + assert failed["stderr_tail"] == "" + + def test_batch_per_job_bg_volume(helpers_ns, ffmpeg_version, synth_av, tmp_path): helpers_ns.write_srt(tmp_path / "s.srt", CUES_2) helpers_ns.write_plan_form_a(tmp_path / "p.json", PLAN_2) From 7059f34a7e137d57f3998a0373d2dae3c30cbae2 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Tue, 19 May 2026 23:02:37 +0800 Subject: [PATCH 06/18] =?UTF-8?q?feat:=20add=20main.py=20=E2=80=94=20proje?= =?UTF-8?q?ct-root=20entry=20point=20with=20input/output=20defaults?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thin wrapper over helpers/srt_driven_edit.py that fills in the layout described in CLAUDE.md: input/source.mp4 + input/script.srt + input/edit_plan.json --(python main.py)--> output/final.mp4 Behavior: - `--srt`, `--plan`, and `-o` defaults are injected when the user did not supply them; output/ is auto-created. - `--source` and `--voice` defaults are injected only when the corresponding file actually exists under input/, so Form B users without input/source.mp4 do not get a misleading "missing on disk" error from a defaulted flag they never wanted. - Both bare (`--srt foo`) and equals (`--srt=foo`) forms count as user-supplied; no double-injection. 
- `--batch <manifest>` short-circuits all single-job defaults so the manifest fully owns its paths. The wrapper performs argv rewriting then forwards to srt_driven_edit.main(), so every existing flag (style, bg-volume, no-overwrite, continue-on-error, etc.) keeps working unchanged. 7 unit tests cover: bare defaults, source/voice file-gated injection, user-flag precedence, equals-form recognition, batch short-circuit, and the short -o alias. Co-Authored-By: Claude Opus 4.7 --- main.py | 86 ++++++++++++++++++++++++++++++++ tests/test_main_entry.py | 104 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 main.py create mode 100644 tests/test_main_entry.py diff --git a/main.py b/main.py new file mode 100644 index 0000000..3e43b97 --- /dev/null +++ b/main.py @@ -0,0 +1,86 @@ +"""Project-root entry point for the SRT-driven editor. + +A thin wrapper over `helpers/srt_driven_edit.py` that fills in the +`input/` -> `output/` layout described in CLAUDE.md so the common case +collapses to a single command: + + python main.py + +The wrapper injects these defaults only when the corresponding flag is +absent from `sys.argv`: + + --srt input/script.srt (always; required by srt_driven_edit) + --plan input/edit_plan.json (always; required by srt_driven_edit) + --source input/source.mp4 (only if the file exists) + --voice input/voice.wav (only if the file exists) + -o output/final.mp4 (always; output/ is auto-created) + +Anything you pass explicitly wins. Batch mode (`--batch <manifest>`) skips +all single-job defaults so the manifest fully owns its own paths. + +Examples: + python main.py + python main.py --bg-volume 0.1 --style cjk-natural + python main.py --plan plans/custom.json -o out/custom.mp4 + python main.py --batch jobs.json --continue-on-error +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +# Wire helpers/ onto sys.path so `from srt_driven_edit import ...` works +# regardless of the user's cwd.
+ROOT = Path(__file__).resolve().parent +sys.path.insert(0, str(ROOT / "helpers")) + +from srt_driven_edit import main as _srt_driven_main # noqa: E402 + + +def _has_flag(args: list[str], *flags: str) -> bool: + """True if any of `flags` appears in `args`, in either bare or `=` form.""" + for token in args: + for f in flags: + if token == f or token.startswith(f + "="): + return True + return False + + +def _inject_defaults(args: list[str]) -> list[str]: + """Add input/ -> output/ defaults for the flags the user did not provide.""" + out = list(args) + + # Batch mode owns its own paths via the manifest — never inject. + if _has_flag(out, "--batch"): + return out + + if not _has_flag(out, "--srt"): + out += ["--srt", "input/script.srt"] + if not _has_flag(out, "--plan"): + out += ["--plan", "input/edit_plan.json"] + + # --source is required for Form A plans but ignored for Form B. Inject + # only when the file is actually present so Form B users with no + # input/source.mp4 don't get a misleading "missing on disk" error. + if not _has_flag(out, "--source") and (ROOT / "input/source.mp4").exists(): + out += ["--source", "input/source.mp4"] + + # Same idea for voice: it's always optional, so inject only when present. + if not _has_flag(out, "--voice") and (ROOT / "input/voice.wav").exists(): + out += ["--voice", "input/voice.wav"] + + if not _has_flag(out, "-o", "--output"): + (ROOT / "output").mkdir(exist_ok=True) + out += ["-o", "output/final.mp4"] + + return out + + +def main() -> None: + sys.argv = [sys.argv[0]] + _inject_defaults(sys.argv[1:]) + _srt_driven_main() + + +if __name__ == "__main__": + main() diff --git a/tests/test_main_entry.py b/tests/test_main_entry.py new file mode 100644 index 0000000..579cb54 --- /dev/null +++ b/tests/test_main_entry.py @@ -0,0 +1,104 @@ +"""Tests for the project-root main.py wrapper. + +Only the default-injection logic is unit-tested here; the actual run_job +path is exercised by tests/test_srt_driven_*. 
+""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + + +@pytest.fixture +def main_mod(monkeypatch, tmp_path): + """Fresh import of main.py rooted at a tmp dir so ROOT/input doesn't leak.""" + monkeypatch.chdir(tmp_path) + # Force-reload main with a new ROOT pointing at tmp_path so file-existence + # checks reflect what the test wrote, not what's actually in the repo root. + import importlib + import main as _m + importlib.reload(_m) + _m.ROOT = tmp_path # rebind so input/source.mp4 etc. resolve in tmp + return _m + + +def test_defaults_when_no_flags(main_mod, tmp_path): + """No flags + nothing in input/ → srt/plan/output defaults, no source/voice.""" + out = main_mod._inject_defaults([]) + assert "--srt" in out and "input/script.srt" in out + assert "--plan" in out and "input/edit_plan.json" in out + assert "-o" in out and "output/final.mp4" in out + # input/source.mp4 doesn't exist → --source NOT injected + assert "--source" not in out + assert "--voice" not in out + # output/ dir was created + assert (tmp_path / "output").is_dir() + + +def test_injects_source_when_present(main_mod, tmp_path): + (tmp_path / "input").mkdir() + (tmp_path / "input" / "source.mp4").write_bytes(b"x") + out = main_mod._inject_defaults([]) + assert "--source" in out and "input/source.mp4" in out + + +def test_injects_voice_when_present(main_mod, tmp_path): + (tmp_path / "input").mkdir() + (tmp_path / "input" / "voice.wav").write_bytes(b"x") + out = main_mod._inject_defaults([]) + assert "--voice" in out and "input/voice.wav" in out + + +def test_user_flags_win(main_mod, tmp_path): + (tmp_path / "input").mkdir() + (tmp_path / "input" / "source.mp4").write_bytes(b"x") + user = ["--srt", "scripts/ep01.srt", + "--plan", "plans/ep01.json", + "--source", "raw/ep01.mp4", + "-o", "out/ep01.mp4"] + out = main_mod._inject_defaults(user) + # User-supplied wins; no 
duplicate defaults appended + assert out.count("--srt") == 1 and "scripts/ep01.srt" in out + assert out.count("--plan") == 1 and "plans/ep01.json" in out + assert out.count("--source") == 1 and "raw/ep01.mp4" in out + assert out.count("-o") == 1 and "out/ep01.mp4" in out + # Default input/script.srt etc. NOT injected + assert "input/script.srt" not in out + assert "input/edit_plan.json" not in out + + +def test_equals_form_recognized(main_mod, tmp_path): + """--flag=value form must count as 'flag is set' so we don't double-inject.""" + out = main_mod._inject_defaults(["--srt=scripts/x.srt", "--plan=plans/x.json"]) + # Defaults must NOT be appended. Both the user's tokens and any default + # bare `--srt` / `--plan` would otherwise coexist. + assert "--srt=scripts/x.srt" in out + assert "--plan=plans/x.json" in out + assert "--srt" not in out # no bare default flag + assert "--plan" not in out + assert "input/script.srt" not in out + assert "input/edit_plan.json" not in out + + +def test_batch_mode_skips_all_defaults(main_mod, tmp_path): + (tmp_path / "input").mkdir() + (tmp_path / "input" / "source.mp4").write_bytes(b"x") + out = main_mod._inject_defaults(["--batch", "jobs.json"]) + # No single-job defaults — manifest owns paths. 
+ assert "--srt" not in out + assert "--plan" not in out + assert "--source" not in out + assert "-o" not in out + assert "--output" not in out + + +def test_short_output_flag_recognized(main_mod, tmp_path): + out = main_mod._inject_defaults(["-o", "custom/path.mp4"]) + assert out.count("-o") == 1 + assert "output/final.mp4" not in out From f88b58dd805ffbf5a84dadcb6877ae80d25be30f Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Tue, 19 May 2026 23:09:06 +0800 Subject: [PATCH 07/18] =?UTF-8?q?feat:=20add=20srt=5Fvideo=5Feditor.py=20?= =?UTF-8?q?=E2=80=94=20minimal=20viable,=20no-ffmpeg=20edition?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A standalone, self-contained scaffold for the SRT-driven editor that deliberately does not touch video. Reads script.srt + edit_plan.json (Form A only), validates id matching with clear error messages, and prints each cue's planned source-time range alongside the cue's output range and a text preview. Why a separate, smaller file when helpers/srt_driven_edit.py already exists: this version is meant to be read top-to-bottom in one sitting. It has zero imports from helpers/, no dependency on ffmpeg, and ~150 lines including comments and blank space. It is the natural starting point for someone learning the pipeline before the production code. Scope strictly per spec: - parse SRT (utf-8-sig, CRLF, cue settings tolerated) - parse plan (Form A only — Form B is explicitly rejected with a pointer to the full pipeline) - validate id sets match, with duplicate detection on both sides - print the cue/source-range table Deliberately NOT implemented: - ffmpeg invocation - EDL or QC artifact emission - Form B sources / voices maps - global voice mixing - subtitle burn - cache, batch, run_episodes integration Defaults to input/script.srt and input/edit_plan.json so the canonical project layout from CLAUDE.md works with no flags. 
Co-Authored-By: Claude Opus 4.7 --- srt_video_editor.py | 174 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 srt_video_editor.py diff --git a/srt_video_editor.py b/srt_video_editor.py new file mode 100644 index 0000000..4dc4f07 --- /dev/null +++ b/srt_video_editor.py @@ -0,0 +1,174 @@ +"""srt_video_editor — minimal viable version. + +Reads script.srt + edit_plan.json, validates that their ids match, and +prints the planned source-time range for each subtitle cue. Does NOT +touch video — this is the starting scaffold before any ffmpeg work. + +Self-contained on purpose: no imports from helpers/ so the whole flow +fits in one readable file. + +Usage: + python srt_video_editor.py + python srt_video_editor.py --srt input/script.srt --plan input/edit_plan.json +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path + + +# ---------- timestamp helpers ---------- + +_TS_RE = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})") + + +def parse_ts(s: str) -> float: + """Parse 'HH:MM:SS,ms' or 'HH:MM:SS.ms' to seconds.""" + m = _TS_RE.fullmatch(s.strip()) + if not m: + raise ValueError(f"bad timestamp: {s!r}") + h, mn, sec, ms = m.groups() + return int(h) * 3600 + int(mn) * 60 + int(sec) + int(ms.ljust(3, "0")) / 1000.0 + + +def format_ts(seconds: float) -> str: + total_ms = int(round(seconds * 1000)) + h, rem = divmod(total_ms, 3600_000) + m, rem = divmod(rem, 60_000) + s, ms = divmod(rem, 1000) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + +# ---------- parsers ---------- + + +def parse_srt(path: Path) -> list[dict]: + """Return a list of {id, start, end, text} in file order. + + Tolerates UTF-8 with or without BOM, CRLF / LF line endings, and + SRT cue settings ('position:90% align:start') trailing the time line. 
+ """ + raw = path.read_text(encoding="utf-8-sig") + cues: list[dict] = [] + for block in re.split(r"\r?\n\r?\n+", raw.strip()): + lines = [ln for ln in block.splitlines() if ln.strip()] + if len(lines) < 2: + continue + try: + cid = int(lines[0].strip()) + except ValueError: + raise SystemExit(f"SRT id line is not an integer: {lines[0]!r}") + if "-->" not in lines[1]: + raise SystemExit(f"SRT block missing '-->' time line: {lines[1]!r}") + left, right = lines[1].split("-->", 1) + start = parse_ts(left.strip().split()[-1]) + end = parse_ts(right.strip().split()[0]) + text = "\n".join(lines[2:]) + cues.append({"id": cid, "start": start, "end": end, "text": text}) + if not cues: + raise SystemExit(f"SRT has no cues: {path}") + return cues + + +def parse_plan(path: Path) -> list[dict]: + """Return a list of {id, source_start, source_end}. Only Form A is + accepted here (a flat JSON array); Form B is out of scope for the + minimal version.""" + data = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(data, list): + raise SystemExit( + "edit_plan.json must be a JSON array of " + "{id, source_start, source_end} objects (Form A)." + ) + out: list[dict] = [] + for row in data: + try: + out.append({ + "id": int(row["id"]), + "source_start": parse_ts(row["source_start"]), + "source_end": parse_ts(row["source_end"]), + }) + except (KeyError, ValueError) as e: + raise SystemExit(f"plan row {row!r}: {e}") + return out + + +# ---------- validation ---------- + + +def validate_ids(cues: list[dict], plan: list[dict]) -> None: + """Each id must appear exactly once in both sides, and the two id sets + must be equal. Any deviation is a hard failure with a clear message. 
+ """ + cue_ids = [c["id"] for c in cues] + plan_ids = [p["id"] for p in plan] + + dup_cue = {i for i in cue_ids if cue_ids.count(i) > 1} + if dup_cue: + raise SystemExit(f"SRT has duplicate ids: {sorted(dup_cue)}") + dup_plan = {i for i in plan_ids if plan_ids.count(i) > 1} + if dup_plan: + raise SystemExit(f"edit_plan has duplicate ids: {sorted(dup_plan)}") + + only_srt = set(cue_ids) - set(plan_ids) + only_plan = set(plan_ids) - set(cue_ids) + if only_srt or only_plan: + msg = [] + if only_srt: + msg.append(f"in SRT but missing in plan: {sorted(only_srt)}") + if only_plan: + msg.append(f"in plan but missing in SRT: {sorted(only_plan)}") + raise SystemExit("id mismatch: " + "; ".join(msg)) + + +# ---------- report ---------- + + +def print_report(cues: list[dict], plan: list[dict]) -> None: + plan_by_id = {p["id"]: p for p in plan} + print(f"{len(cues)} cue(s), all ids matched.") + print() + header = f" {'ID':>3} {'OUTPUT (cue)':<23} {'SOURCE (planned)':<23} TEXT" + print(header) + print(f" {'-' * 3} {'-' * 23} {'-' * 23} {'-' * 4}") + for cue in sorted(cues, key=lambda c: c["id"]): + p = plan_by_id[cue["id"]] + out_range = f"{format_ts(cue['start'])} -> {format_ts(cue['end'])}" + src_range = f"{format_ts(p['source_start'])} -> {format_ts(p['source_end'])}" + preview = cue["text"].replace("\n", " ") + if len(preview) > 50: + preview = preview[:47] + "..." + print(f" {cue['id']:>3} {out_range:<23} {src_range:<23} {preview}") + + +# ---------- entry ---------- + + +def main() -> None: + ap = argparse.ArgumentParser( + description=( + "Minimal SRT-driven editor. Reads script.srt + edit_plan.json, " + "validates id matching, prints the planned source range for each " + "cue. No ffmpeg, no actual cutting." 
+ ), + ) + ap.add_argument("--srt", type=Path, default=Path("input/script.srt")) + ap.add_argument("--plan", type=Path, default=Path("input/edit_plan.json")) + args = ap.parse_args() + + for p in (args.srt, args.plan): + if not p.is_file(): + raise SystemExit(f"file not found: {p}") + + cues = parse_srt(args.srt) + plan = parse_plan(args.plan) + validate_ids(cues, plan) + print_report(cues, plan) + + +if __name__ == "__main__": + main() From 09783f5efcb06e4730c62adf3aedfe1d9124937b Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Tue, 19 May 2026 23:16:09 +0800 Subject: [PATCH 08/18] feat(srt_video_editor): cut clips per cue into temp/ with ffmpeg Extends the minimal entry point with the first ffmpeg pass. The existing SRT parsing, plan parsing, and id validation are unchanged; print_report still runs before extraction so you see the planned mapping before the cutter touches the disk. Adds two functions: cut_clip(source, start, end, out) -ss before -i + libx264 re-encode (frame-accurate). Keeps the original audio via -c:a aac. Raises SystemExit with the full command and the complete ffmpeg stderr on non-zero exit; raises a friendly "ffmpeg not on PATH" message instead of FileNotFoundError when the binary is missing. extract_clips(cues, plan, source, temp_dir) Iterates cues in id order, computes source_end - source_start, hard-fails with the offending id on duration <= 0, prints `id / start / end / out_path` per clip before invoking the cutter, and writes to `<temp-dir>/clip_<NNN>.mp4`. Filenames are keyed by cue id (not enumerate) so a clip is traceable to its cue at a glance even with sparse ids. Two new CLI flags: --source defaults to input/source.mp4 --temp-dir defaults to temp/ (auto-created) Deliberately NOT done (still out of scope for the minimal scaffold): concatenation, audio fades, sync tails, HDR tone-map, subtitle burn, EDL artifact, QC report. Those live in helpers/srt_driven_edit.py.
Smoke-verified end-to-end: - 3-cue happy path: 3 clip_NNN.mp4 files emitted - bad plan (source_end < source_start on id=2): clip 1 cuts, run aborts with `plan id=2: source_end ... <= source_start ...` Co-Authored-By: Claude Opus 4.7 --- srt_video_editor.py | 107 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 9 deletions(-) diff --git a/srt_video_editor.py b/srt_video_editor.py index 4dc4f07..a3586c8 100644 --- a/srt_video_editor.py +++ b/srt_video_editor.py @@ -1,15 +1,17 @@ """srt_video_editor — minimal viable version. -Reads script.srt + edit_plan.json, validates that their ids match, and -prints the planned source-time range for each subtitle cue. Does NOT -touch video — this is the starting scaffold before any ffmpeg work. +Reads script.srt + edit_plan.json, validates that their ids match, +prints the planned source-time range for each cue, then cuts each cue's +range out of source.mp4 into temp/clip_.mp4 with ffmpeg. -Self-contained on purpose: no imports from helpers/ so the whole flow -fits in one readable file. +Stays self-contained on purpose: no imports from helpers/, so the whole +flow fits in one readable file. Concatenation, audio fades, subtitle +burn, etc. are NOT done here — they live in helpers/srt_driven_edit.py. Usage: python srt_video_editor.py - python srt_video_editor.py --srt input/script.srt --plan input/edit_plan.json + python srt_video_editor.py --srt input/script.srt --plan input/edit_plan.json \\ + --source input/source.mp4 --temp-dir temp/ """ from __future__ import annotations @@ -17,6 +19,7 @@ import argparse import json import re +import subprocess import sys from pathlib import Path @@ -128,6 +131,86 @@ def validate_ids(cues: list[dict], plan: list[dict]) -> None: # ---------- report ---------- +def cut_clip(source: Path, start: float, end: float, out_path: Path) -> None: + """Cut [start, end] from source to out_path, re-encoded for frame accuracy. + + Keeps the original audio. 
`-ss` placed before `-i` makes ffmpeg do a + fast container-level seek to the nearest keyframe, then libx264 + re-encodes from there — frame-accurate at the cost of one encode pass. + + Raises SystemExit with the full ffmpeg command + stderr on failure so + the caller never has to scroll the terminal to find what went wrong. + """ + duration = end - start + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-ss", f"{start:.3f}", + "-i", str(source), + "-t", f"{duration:.3f}", + "-c:v", "libx264", "-preset", "fast", "-crf", "20", + "-c:a", "aac", "-b:a", "192k", + "-pix_fmt", "yuv420p", + "-movflags", "+faststart", + str(out_path), + ] + try: + proc = subprocess.run( + cmd, capture_output=True, text=True, + encoding="utf-8", errors="replace", + ) + except FileNotFoundError: + raise SystemExit( + "ffmpeg not found on PATH. Install ffmpeg " + "(`winget install Gyan.FFmpeg` on Windows, " + "`brew install ffmpeg` on macOS) and re-run." + ) + if proc.returncode != 0: + raise SystemExit( + f"ffmpeg failed on {out_path.name} (exit {proc.returncode})\n" + f"--- command ---\n{' '.join(cmd)}\n" + f"--- stderr ---\n{proc.stderr or '(empty)'}" + ) + + +def extract_clips( + cues: list[dict], + plan: list[dict], + source: Path, + temp_dir: Path, +) -> list[Path]: + """Cut one clip per cue. Returns the list of output paths in cue-id order. + + Filenames are `clip_.mp4`, indexed by SRT id (not position) so + each clip is traceable back to its cue at a glance even if ids are + sparse or non-consecutive. 
+ """ + plan_by_id = {p["id"]: p for p in plan} + temp_dir.mkdir(parents=True, exist_ok=True) + + print() + print(f"cutting {len(cues)} clip(s) -> {temp_dir}/") + outputs: list[Path] = [] + for cue in sorted(cues, key=lambda c: c["id"]): + cid = cue["id"] + p = plan_by_id[cid] + start = p["source_start"] + end = p["source_end"] + duration = end - start + if duration <= 0: + raise SystemExit( + f"plan id={cid}: source_end {format_ts(end)} <= " + f"source_start {format_ts(start)} (duration {duration:.3f}s)" + ) + out_path = temp_dir / f"clip_{cid:03d}.mp4" + print( + f" id={cid:>3} {format_ts(start)} -> {format_ts(end)} " + f"({duration:.3f}s) -> {out_path}" + ) + cut_clip(source, start, end, out_path) + outputs.append(out_path) + return outputs + + def print_report(cues: list[dict], plan: list[dict]) -> None: plan_by_id = {p["id"]: p for p in plan} print(f"{len(cues)} cue(s), all ids matched.") @@ -152,15 +235,18 @@ def main() -> None: ap = argparse.ArgumentParser( description=( "Minimal SRT-driven editor. Reads script.srt + edit_plan.json, " - "validates id matching, prints the planned source range for each " - "cue. No ffmpeg, no actual cutting." + "validates id matching, prints the planned source range table, " + "then cuts each cue out of source.mp4 into temp/clip_*.mp4. " + "No concatenation yet." ), ) ap.add_argument("--srt", type=Path, default=Path("input/script.srt")) ap.add_argument("--plan", type=Path, default=Path("input/edit_plan.json")) + ap.add_argument("--source", type=Path, default=Path("input/source.mp4")) + ap.add_argument("--temp-dir", type=Path, default=Path("temp")) args = ap.parse_args() - for p in (args.srt, args.plan): + for p in (args.srt, args.plan, args.source): if not p.is_file(): raise SystemExit(f"file not found: {p}") @@ -168,6 +254,9 @@ def main() -> None: plan = parse_plan(args.plan) validate_ids(cues, plan) print_report(cues, plan) + extract_clips(cues, plan, args.source, args.temp_dir) + print() + print(f"done. 
{len(cues)} clip(s) in {args.temp_dir}/") if __name__ == "__main__": From 2ffe0675c8ce2345ae4ddab29dfa36eb982707cf Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Wed, 20 May 2026 21:53:01 +0800 Subject: [PATCH 09/18] feat(srt_video_editor): concat temp clips into output/final.mp4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the second ffmpeg pass. The cutting stage from the previous commit already encodes every clip with the same libx264 / aac parameters, so the concat demuxer can do a lossless `-c copy` join — no re-encode, near-instant, byte-for-byte fidelity to the cut clips. New function: concat_clips(clip_paths, out_path) Writes `<temp-dir>/_concat.txt` with `file '...'` directives in cue-id order, invokes `ffmpeg -f concat -safe 0 -c copy`, and cleans up the list file in `finally` so both happy and unhappy paths leave a tidy temp/. Same error contract as cut_clip: full ffmpeg command + complete stderr on non-zero exit; friendly "ffmpeg not on PATH" instead of FileNotFoundError. CLI: one new flag, `--output`, default `output/final.mp4`. output/ auto-created. extract_clips now returns its list of output paths so main() can hand them to concat_clips without rebuilding the list. Smoke-verified end-to-end on a 10s synthetic source: 3 cues at 1.5s + 1.5s + 2.0s -> final.mp4 measured at 5.023s (~0.02s drift is normal: each per-segment libx264 encode rounds to its first keyframe). clips_out/ ends with exactly clip_001..003.mp4 and no leftover _concat.txt. Co-Authored-By: Claude Opus 4.7 --- srt_video_editor.py | 77 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 10 deletions(-) diff --git a/srt_video_editor.py b/srt_video_editor.py index a3586c8..37c3843 100644 --- a/srt_video_editor.py +++ b/srt_video_editor.py @@ -1,17 +1,19 @@ """srt_video_editor — minimal viable version.
Reads script.srt + edit_plan.json, validates that their ids match, -prints the planned source-time range for each cue, then cuts each cue's -range out of source.mp4 into temp/clip_.mp4 with ffmpeg. +prints the planned source-time range for each cue, cuts each cue's +range out of source.mp4 into temp/clip_.mp4, then concatenates +the clips in cue-id order into output/final.mp4. Stays self-contained on purpose: no imports from helpers/, so the whole -flow fits in one readable file. Concatenation, audio fades, subtitle -burn, etc. are NOT done here — they live in helpers/srt_driven_edit.py. +flow fits in one readable file. Audio fades, subtitle burn, color +grading, etc. are NOT done here — they live in helpers/srt_driven_edit.py. Usage: python srt_video_editor.py python srt_video_editor.py --srt input/script.srt --plan input/edit_plan.json \\ - --source input/source.mp4 --temp-dir temp/ + --source input/source.mp4 \\ + --temp-dir temp/ --output output/final.mp4 """ from __future__ import annotations @@ -211,6 +213,57 @@ def extract_clips( return outputs +def concat_clips(clip_paths: list[Path], out_path: Path) -> None: + """Lossless concat of pre-encoded clips via ffmpeg's concat demuxer. + + The clips produced by `cut_clip` all share the same encoder params + (libx264, yuv420p, aac), so `-c copy` is safe and instant — no + re-encode. The concat list file is written next to the first clip + (typically `temp/_concat.txt`) and removed in `finally` so a clean + run leaves a tidy temp/ and a failed run doesn't leave a stale list. + + Raises SystemExit with the full ffmpeg command + stderr on failure. 
+ """ + if not clip_paths: + raise SystemExit("concat: no clips to concatenate") + + list_file = clip_paths[0].parent / "_concat.txt" + list_file.write_text( + "".join(f"file '{p.resolve().as_posix()}'\n" for p in clip_paths), + encoding="utf-8", + ) + out_path.parent.mkdir(parents=True, exist_ok=True) + cmd = [ + "ffmpeg", "-y", "-hide_banner", "-nostats", + "-f", "concat", "-safe", "0", + "-i", str(list_file), + "-c", "copy", + "-movflags", "+faststart", + str(out_path), + ] + try: + try: + proc = subprocess.run( + cmd, capture_output=True, text=True, + encoding="utf-8", errors="replace", + ) + except FileNotFoundError: + raise SystemExit( + "ffmpeg not found on PATH. Install ffmpeg " + "(`winget install Gyan.FFmpeg` on Windows, " + "`brew install ffmpeg` on macOS) and re-run." + ) + if proc.returncode != 0: + raise SystemExit( + f"ffmpeg concat failed (exit {proc.returncode})\n" + f"--- command ---\n{' '.join(cmd)}\n" + f"--- stderr ---\n{proc.stderr or '(empty)'}" + ) + finally: + list_file.unlink(missing_ok=True) + print(f" concat {len(clip_paths)} clip(s) -> {out_path}") + + def print_report(cues: list[dict], plan: list[dict]) -> None: plan_by_id = {p["id"]: p for p in plan} print(f"{len(cues)} cue(s), all ids matched.") @@ -235,15 +288,16 @@ def main() -> None: ap = argparse.ArgumentParser( description=( "Minimal SRT-driven editor. Reads script.srt + edit_plan.json, " - "validates id matching, prints the planned source range table, " - "then cuts each cue out of source.mp4 into temp/clip_*.mp4. " - "No concatenation yet." + "validates id matching, prints the planned range table, cuts " + "each cue out of source.mp4 into temp/clip_.mp4, then " + "lossless-concats the clips into output/final.mp4." 
), ) ap.add_argument("--srt", type=Path, default=Path("input/script.srt")) ap.add_argument("--plan", type=Path, default=Path("input/edit_plan.json")) ap.add_argument("--source", type=Path, default=Path("input/source.mp4")) ap.add_argument("--temp-dir", type=Path, default=Path("temp")) + ap.add_argument("--output", type=Path, default=Path("output/final.mp4")) args = ap.parse_args() for p in (args.srt, args.plan, args.source): @@ -254,9 +308,12 @@ def main() -> None: plan = parse_plan(args.plan) validate_ids(cues, plan) print_report(cues, plan) - extract_clips(cues, plan, args.source, args.temp_dir) + clip_paths = extract_clips(cues, plan, args.source, args.temp_dir) + print() + print(f"concatenating -> {args.output}") + concat_clips(clip_paths, args.output) print() - print(f"done. {len(cues)} clip(s) in {args.temp_dir}/") + print(f"done. final video: {args.output}") if __name__ == "__main__": From 55f623c4047e0bc0a5d239a37f292b22d26027ad Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Wed, 20 May 2026 22:20:12 +0800 Subject: [PATCH 10/18] fix(batch): catch non-SystemExit errors so --continue-on-error is real MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now, --continue-on-error caught only SystemExit. A row with malformed plan.json would raise JSONDecodeError (ValueError subclass) straight through the loop and crash the whole batch — exactly the failure mode the flag was supposed to prevent. Same for FileNotFoundError when ffprobe disappears mid-run, and for any other unforeseen Exception. This commit hardens the path in three layers: 1. Source-side wrapping. parse_plan() and load_manifest() now wrap json.loads in try/except JSONDecodeError, re-raising as SystemExit with file path + line/col context. probe_streams() also catches FileNotFoundError (ffprobe missing) with an actionable install hint. Direct callers get a clean message, not a stack trace. 2. Loop-side widening. 
Both batch sites (srt_driven_edit.main's --batch path and run_episodes.run_episodes) now catch (SystemExit, Exception) — broad enough to cover everything except KeyboardInterrupt/GeneratorExit/SystemExit-from-Ctrl-C semantics. The failure record carries the exception type name so triage stays easy. 3. Failure records keep working. make_failure_record() already used getattr(e, 'stderr_tail', '') so it tolerates either SystemExit or Exception. Plain Exception leaves stderr_tail empty as designed. Tests (2 new, 18 total in this slice): - test_batch_continues_past_corrupt_plan_json: 3-row manifest with middle row's plan = "{ this is not json"; verifies row 0 succeeds, row 1 records JSONDecodeError, row 2 still runs. - test_run_episodes_continues_past_corrupt_plan_json: same shape for the directory-driven runner. Co-Authored-By: Claude Opus 4.7 --- helpers/run_episodes.py | 5 +-- helpers/srt_driven_edit.py | 56 ++++++++++++++++++++++++++-------- tests/test_run_episodes.py | 32 +++++++++++++++++++ tests/test_srt_driven_batch.py | 41 +++++++++++++++++++++++-- 4 files changed, 118 insertions(+), 16 deletions(-) diff --git a/helpers/run_episodes.py b/helpers/run_episodes.py index 847f406..8f6300b 100644 --- a/helpers/run_episodes.py +++ b/helpers/run_episodes.py @@ -155,9 +155,10 @@ def run_episodes( try: qc = run_job(job, ffmpeg_version) results.append(qc) - except SystemExit as e: + except (SystemExit, Exception) as e: if continue_on_error: - print(f"[{i + 1}/{len(eps)}] FAILED: {e}") + print(f"[{i + 1}/{len(eps)}] FAILED: " + f"{type(e).__name__}: {e}") results.append(make_failure_record( index=i, name=ep.name, error=e, job=job, )) diff --git a/helpers/srt_driven_edit.py b/helpers/srt_driven_edit.py index ac9d2f5..75904dd 100644 --- a/helpers/srt_driven_edit.py +++ b/helpers/srt_driven_edit.py @@ -273,8 +273,9 @@ def preflight() -> dict[str, str]: def probe_streams(path: Path) -> dict: """Probe a media file for {has_video, has_audio, duration}. 
- Raises SystemExit on probe failure so the caller doesn't continue - blindly. Result is cheap to memoize per source path. + Raises SystemExit on any probe failure (binary missing, bad file, + malformed output) so the caller doesn't continue blindly. Result + is cheap to memoize per source path. """ try: r = subprocess.run( @@ -287,11 +288,19 @@ def probe_streams(path: Path) -> dict: capture_output=True, text=True, check=True, encoding="utf-8", errors="replace", ) + except FileNotFoundError: + raise SystemExit( + "ffprobe not on PATH. Install ffmpeg " + "(`winget install Gyan.FFmpeg` / `brew install ffmpeg`)." + ) except subprocess.CalledProcessError as e: raise SystemExit( f"ffprobe failed on {path}: {(e.stderr or '')[:300]}" ) - data = json.loads(r.stdout) + try: + data = json.loads(r.stdout) + except json.JSONDecodeError as e: + raise SystemExit(f"ffprobe returned malformed JSON for {path}: {e}") types: set[str] = set() for s in data.get("streams", []) or []: t = s.get("codec_type") @@ -465,7 +474,17 @@ def validate_srt(cues: list[SrtCue]) -> None: def parse_plan(path: Path) -> tuple[dict[str, Path], dict[str, Path], list[PlanEntry]]: """Returns (sources_map, voices_map, entries). 
Detects Form A vs B.""" - data = json.loads(path.read_text(encoding="utf-8")) + try: + raw = path.read_text(encoding="utf-8") + except OSError as e: + raise SystemExit(f"edit_plan unreadable: {path}: {e}") + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise SystemExit( + f"edit_plan is not valid JSON: {path}: " + f"line {e.lineno} col {e.colno}: {e.msg}" + ) base = path.parent if isinstance(data, list): @@ -1432,15 +1451,28 @@ def make_failure_record( def load_manifest(path: Path) -> list[dict]: suffix = path.suffix.lower() if suffix == ".json": - data = json.loads(path.read_text(encoding="utf-8")) + try: + raw = path.read_text(encoding="utf-8") + except OSError as e: + raise SystemExit(f"batch manifest unreadable: {path}: {e}") + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise SystemExit( + f"batch manifest is not valid JSON: {path}: " + f"line {e.lineno} col {e.colno}: {e.msg}" + ) if not isinstance(data, list): raise SystemExit("batch manifest JSON must be an array of job dicts") return data if suffix == ".csv": rows: list[dict] = [] - with path.open(newline="", encoding="utf-8-sig") as f: - for row in csv.DictReader(f): - rows.append({k: v for k, v in row.items() if v != ""}) + try: + with path.open(newline="", encoding="utf-8-sig") as f: + for row in csv.DictReader(f): + rows.append({k: v for k, v in row.items() if v != ""}) + except (OSError, csv.Error) as e: + raise SystemExit(f"batch manifest CSV error: {path}: {e}") return rows raise SystemExit(f"unsupported manifest format: {suffix}") @@ -1553,9 +1585,9 @@ def main() -> None: for i, row in enumerate(rows): try: job = job_from_dict(row, args, manifest_path.parent, i) - except SystemExit as e: + except (SystemExit, Exception) as e: if args.continue_on_error: - print(f"[batch {i}] skipped: {e}") + print(f"[batch {i}] skipped: {type(e).__name__}: {e}") results.append(make_failure_record( index=i, name=row.get("name", f"row{i}"), error=e, job=None, 
manifest_row=row, @@ -1564,9 +1596,9 @@ def main() -> None: raise try: results.append(run_job(job, versions["ffmpeg"])) - except SystemExit as e: + except (SystemExit, Exception) as e: if args.continue_on_error: - print(f"[batch {i}] FAILED: {e}") + print(f"[batch {i}] FAILED: {type(e).__name__}: {e}") results.append(make_failure_record( index=i, name=job.name, error=e, job=job, )) diff --git a/tests/test_run_episodes.py b/tests/test_run_episodes.py index 754ce42..956cf52 100644 --- a/tests/test_run_episodes.py +++ b/tests/test_run_episodes.py @@ -177,6 +177,38 @@ def test_run_episodes_failure_record_includes_paths( assert rec["stderr_tail"] == "" +def test_run_episodes_continues_past_corrupt_plan_json( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """A non-SystemExit (JSONDecodeError) inside run_job must NOT abort + --continue-on-error. Pre-fix the loop only caught SystemExit, so a + malformed edit_plan.json in one ep would crash the whole batch.""" + batch = tmp_path / "batch" + _make_ep(batch / "ep01", synth_av, helpers_ns) + # ep02: valid SRT + source, but plan.json is garbage + ep02 = batch / "ep02" + ep02.mkdir() + import shutil + shutil.copy2(synth_av, ep02 / "source.mp4") + helpers_ns.write_srt(ep02 / "script.srt", [(1, 0.0, 1.5, "x")]) + (ep02 / "edit_plan.json").write_text("{ not json", encoding="utf-8") + # ep03 should still run + _make_ep(batch / "ep03", synth_av, helpers_ns) + + summary = runner.run_episodes( + batch, ffmpeg_version=ffmpeg_version, + continue_on_error=True, + ) + assert summary["episodes_total"] == 3 + assert summary["ok"] == 2 + + failed = [r for r in summary["results"] if not r.get("ok")] + assert len(failed) == 1 and failed[0]["job"] == "ep02" + assert "JSON" in failed[0]["error"] or "json" in failed[0]["error"] + # ep03 (the post-bad one) must have run + assert (batch / "ep03" / "final.mp4").exists() + + def test_run_episodes_per_ep_voice( runner, helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path ): diff 
--git a/tests/test_srt_driven_batch.py b/tests/test_srt_driven_batch.py index 8a41c90..37733d5 100644 --- a/tests/test_srt_driven_batch.py +++ b/tests/test_srt_driven_batch.py @@ -52,7 +52,7 @@ def run_batch(helpers_ns, manifest_path, ffmpeg_version, *, for i, row in enumerate(rows): try: job = sde.job_from_dict(row, defaults, manifest_path.parent, i) - except SystemExit as e: + except (SystemExit, Exception) as e: if continue_on_error: results.append(sde.make_failure_record( index=i, name=row.get("name", f"row{i}"), @@ -62,7 +62,7 @@ def run_batch(helpers_ns, manifest_path, ffmpeg_version, *, raise try: results.append(sde.run_job(job, ffmpeg_version)) - except SystemExit as e: + except (SystemExit, Exception) as e: if continue_on_error: results.append(sde.make_failure_record( index=i, name=job.name, error=e, job=job, @@ -268,6 +268,43 @@ def test_batch_malformed_row_failure_record(helpers_ns, ffmpeg_version, tmp_path assert failed["stderr_tail"] == "" +def test_batch_continues_past_corrupt_plan_json( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """A row whose plan.json is malformed must NOT abort the batch under + --continue-on-error. JSONDecodeError used to escape the loop because + we only caught SystemExit; the failure record now captures it. 
+ """ + # Good row + helpers_ns.write_srt(tmp_path / "s_ok.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p_ok.json", PLAN_2) + # Bad plan: not valid JSON + helpers_ns.write_srt(tmp_path / "s_bad.srt", CUES_2) + (tmp_path / "p_bad.json").write_text("{ this is not json", encoding="utf-8") + # Another good row after the bad one — must still run + helpers_ns.write_srt(tmp_path / "s_ok2.srt", CUES_2) + helpers_ns.write_plan_form_a(tmp_path / "p_ok2.json", PLAN_2) + + manifest_path = tmp_path / "jobs.json" + manifest_path.write_text(json.dumps([ + {"name": "ok0", "source": str(synth_av), + "srt": "s_ok.srt", "plan": "p_ok.json"}, + {"name": "broken", "source": str(synth_av), + "srt": "s_bad.srt", "plan": "p_bad.json"}, + {"name": "ok2", "source": str(synth_av), + "srt": "s_ok2.srt", "plan": "p_ok2.json"}, + ]), encoding="utf-8") + + results = run_batch(helpers_ns, manifest_path, ffmpeg_version, + continue_on_error=True) + assert len(results) == 3 + assert results[0]["ok"] is True + assert results[1]["ok"] is False + assert "JSON" in results[1]["error"] or "json" in results[1]["error"] + assert results[1]["plan"] and results[1]["plan"].endswith("p_bad.json") + assert results[2]["ok"] is True + + def test_batch_per_job_bg_volume(helpers_ns, ffmpeg_version, synth_av, tmp_path): helpers_ns.write_srt(tmp_path / "s.srt", CUES_2) helpers_ns.write_plan_form_a(tmp_path / "p.json", PLAN_2) From 0bc07c59e0e099c5cf6895d4960551f696754878 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Wed, 20 May 2026 22:23:14 +0800 Subject: [PATCH 11/18] docs(srt_video_editor): mark as learning-grade, redirect to main.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The minimal editor is being mistaken for a production entry point. 
It intentionally lacks encoding fallback (GBK / GB18030 SRT crashes), source range bounds (a plan that overruns the source surfaces as a confusing ffmpeg error, not a clear "id=X exceeds duration"), QC artifacts, and overwrite protection (`-y` silently clobbers temp/ and output/ on every run). The fixes for all of those already live in helpers/srt_driven_edit.py and behind `python main.py`. Rather than reinvent those features inside srt_video_editor.py — which would defeat its "read it top to bottom in one sitting" purpose — make the framing explicit: - Module docstring now opens with a banner that says THIS IS NOT THE PRODUCTION ENTRY POINT and lists every shortcut taken. - argparse description names the omissions and points at main.py. - First line of runtime output is a one-line minimal-mode notice so users who only read terminal output also see the redirect. No behavior change; existing smoke run on the 10s lavfi synth still produces a 5-second concat with the notice prepended. Co-Authored-By: Claude Opus 4.7 --- srt_video_editor.py | 59 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/srt_video_editor.py b/srt_video_editor.py index 37c3843..b1c3d68 100644 --- a/srt_video_editor.py +++ b/srt_video_editor.py @@ -1,13 +1,35 @@ -"""srt_video_editor — minimal viable version. - -Reads script.srt + edit_plan.json, validates that their ids match, -prints the planned source-time range for each cue, cuts each cue's -range out of source.mp4 into temp/clip_.mp4, then concatenates -the clips in cue-id order into output/final.mp4. - -Stays self-contained on purpose: no imports from helpers/, so the whole -flow fits in one readable file. Audio fades, subtitle burn, color -grading, etc. are NOT done here — they live in helpers/srt_driven_edit.py. +"""srt_video_editor — minimal viable, learning-grade scaffold. + +================================================================ +THIS IS NOT THE PRODUCTION ENTRY POINT. 
Use `python main.py` (or +`python helpers/srt_driven_edit.py` directly) for any real work. +================================================================ + +What this script does: + - reads script.srt + edit_plan.json + - validates ids match + - prints the planned mapping + - cuts each cue out of source.mp4 to temp/clip_.mp4 + - lossless-concats into output/final.mp4 + +What it deliberately DOES NOT do (use main.py / srt_driven_edit.py +for any of these): + - encoding fallback — only UTF-8 / UTF-8-with-BOM SRT is accepted; + GB18030 / cp936 input will crash + - source range bounds check — a plan that overruns the source's + duration will surface as a confusing ffmpeg error, not a clear + "id=X exceeds source duration" up front + - QC report — no per-clip drift, no disk-usage accounting, no + structured failure record + - overwrite protection — every run silently `-y` overwrites the + temp/ clips and output/final.mp4 + - audio fades at cut points (you may hear pops on hard cuts) + - voice replacement, subtitle burn, color grade, HDR tone-map, + sync tails, segment cache, batch / per-episode discovery + +Self-contained on purpose: no imports from helpers/, so the entire +flow fits in one readable file. Use this to learn the pipeline; ship +with main.py. Usage: python srt_video_editor.py @@ -287,10 +309,13 @@ def print_report(cues: list[dict], plan: list[dict]) -> None: def main() -> None: ap = argparse.ArgumentParser( description=( - "Minimal SRT-driven editor. Reads script.srt + edit_plan.json, " - "validates id matching, prints the planned range table, cuts " - "each cue out of source.mp4 into temp/clip_.mp4, then " - "lossless-concats the clips into output/final.mp4." + "MINIMAL learning-grade SRT-driven editor. NOT for production — " + "use `python main.py` for that. 
This script reads script.srt + " + "edit_plan.json, validates id matching, prints the planned range " + "table, cuts each cue out of source.mp4 into temp/clip_.mp4, " + "then lossless-concats them into output/final.mp4. No encoding " + "fallback, no range-bounds check, no QC report, no overwrite " + "protection." ), ) ap.add_argument("--srt", type=Path, default=Path("input/script.srt")) @@ -300,6 +325,12 @@ def main() -> None: ap.add_argument("--output", type=Path, default=Path("output/final.mp4")) args = ap.parse_args() + print( + "[srt_video_editor: minimal mode — UTF-8 SRT only, no range/QC " + "checks, temp/ + output/ will be overwritten. For production " + "use `python main.py`.]" + ) + for p in (args.srt, args.plan, args.source): if not p.is_file(): raise SystemExit(f"file not found: {p}") From 7094eb81e3503bd1f87e748fcfe01184e8a960d6 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Wed, 20 May 2026 22:29:32 +0800 Subject: [PATCH 12/18] feat(recommend): monotonic-source guard + source-time discontinuity warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The greedy per-cue best-match in assign() makes no assumption about narrative time direction. When the same line appears multiple times in the source recording (very common: opening tags, repeated greetings, brand mentions, narrator interjections), the matcher can pick the LATER instance for an earlier cue and the EARLIER instance for a later cue. The ids align, durations align, scores look great — and the finished cut jumps backward in scene time without anyone noticing until playback. Two new knobs to defend against this: --monotonic-source HARD constraint. assign() skips candidates whose start time is before the previous match's end. If no forward candidate exists, the cue gets an explicit "no candidate at or after T (monotonic constraint)" warning and the run hard-fails per the no-candidate contract — exactly the loud signal you want. 
--max-source-gap N SOFT threshold (seconds). Any adjacent assignment pair whose |source-time gap| exceeds N earns a warning, in BOTH monotonic and non-monotonic modes. Even with neither flag set, a backward source-time jump now ALWAYS adds a warning to the later cue. The review markdown surfaces these in the existing per-cue WARNING list, so manual QA catches the problem without needing to read the source ranges side by side. assign()'s structure is unchanged: pick best match per cue first, then the new post-pass scans adjacent pairs for backward/big-gap issues. The monotonic constraint short-circuits the inner candidate loop directly so impossible picks are never considered. Tests (4 new, 15 total in this slice): - backward_source_jump_warns_by_default: constructs the exact confound (cue 1 prefers a longer later instance; cue 2 gets stuck with an earlier instance) and verifies the warning fires. - monotonic_source_prevents_backward_jump: same setup + a third forward-only instance; under --monotonic-source, cue 2 picks forward and no backward warning appears. - max_source_gap_warning: 63s gap between matches, --max-source-gap 10 fires the warning. - monotonic_with_no_forward_candidate_fails: only backward candidate available; SystemExit per the no-candidate-found contract. Co-Authored-By: Claude Opus 4.7 --- helpers/recommend_edit_plan.py | 71 ++++++++++++++++- tests/test_recommend_edit_plan.py | 127 ++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 1 deletion(-) diff --git a/helpers/recommend_edit_plan.py b/helpers/recommend_edit_plan.py index 14b2c8a..7db1c21 100644 --- a/helpers/recommend_edit_plan.py +++ b/helpers/recommend_edit_plan.py @@ -295,29 +295,61 @@ def assign( allow_reuse: bool = False, min_score: float = 0.35, duration_warn_ratio: float = 0.5, + monotonic_source: bool = False, + max_source_gap_warn: float | None = None, ) -> list[Assignment]: + """Pick the best candidate for each cue in id order. 
+ + monotonic_source: when True, a candidate is only considered if its + start time is >= the previously assigned candidate's end. Prevents + narrative time reversal when the same line appears multiple times + in the source (the matcher can otherwise pick an earlier instance + for a later cue). + + max_source_gap_warn: if set, any adjacent assignment pair whose + absolute source-time gap exceeds the threshold gets a warning. + Soft signal — does not affect selection. + + Even in non-monotonic mode, a backward source-time jump always + earns a warning so the review markdown surfaces it. + """ used: set[int] = set() out: list[Assignment] = [] + # Floor that the NEXT candidate's start must clear under monotonic mode. + min_start_floor = 0.0 + for cue in cues: best_idx = -1 best_score = -1.0 for i, cand in enumerate(candidates): if not allow_reuse and i in used: continue + if monotonic_source and cand.start < min_start_floor - 1e-6: + continue s = combined_score(cue, cand) if s > best_score: best_score = s best_idx = i + warns: list[str] = [] cand_out: Candidate | None = None if best_idx < 0: - warns.append("no candidate available") + if monotonic_source: + warns.append( + f"no candidate available at or after source time " + f"{format_srt_ts(min_start_floor)} (monotonic constraint)" + ) + else: + warns.append("no candidate available") score_out = 0.0 else: cand_out = candidates[best_idx] score_out = best_score if not allow_reuse: used.add(best_idx) + if monotonic_source: + # Next cue must start at or after this candidate's end. + min_start_floor = cand_out.end if best_score < min_score: warns.append(f"low score {best_score:.3f} < {min_score}") if cue.duration > 0: @@ -336,6 +368,29 @@ def assign( cue_id=cue.id, cue_text=cue.text, cue_duration=cue.duration, cand=cand_out, score=score_out, warnings=warns, )) + + # Post-pass: surface source-time discontinuities as warnings on the + # later cue of the pair. 
Backward jumps are flagged in non-monotonic + # mode (impossible by construction in monotonic mode). Large gaps are + # flagged in both modes when --max-source-gap is set. + for i in range(1, len(out)): + prev_cand = out[i - 1].cand + curr_cand = out[i].cand + if prev_cand is None or curr_cand is None: + continue + gap = curr_cand.start - prev_cand.end + if not monotonic_source and gap < -1e-3: + out[i].warnings.append( + f"source time goes backward {gap:+.2f}s: prev cue ends at " + f"{format_srt_ts(prev_cand.end)}, this cue starts at " + f"{format_srt_ts(curr_cand.start)}" + ) + if max_source_gap_warn is not None and abs(gap) > max_source_gap_warn: + out[i].warnings.append( + f"source-time jump {gap:+.2f}s exceeds " + f"--max-source-gap {max_source_gap_warn:.2f}s" + ) + return out @@ -440,6 +495,8 @@ def recommend( min_score: float = 0.35, allow_reuse: bool = False, keep_audio_events: bool = False, + monotonic_source: bool = False, + max_source_gap_warn: float | None = None, ) -> list[Assignment]: cues = _parse_srt(script_srt) if not cues: @@ -461,6 +518,8 @@ def recommend( assignments = assign( cues, candidates, allow_reuse=allow_reuse, min_score=min_score, + monotonic_source=monotonic_source, + max_source_gap_warn=max_source_gap_warn, ) if output_format == "form-a": @@ -520,6 +579,14 @@ def main() -> None: help="allow one candidate to be assigned to multiple cues") ap.add_argument("--keep-audio-events", action="store_true", help="keep (laughter) (applause) tokens as candidate context") + ap.add_argument("--monotonic-source", action="store_true", + help="require each cue's source range to start at or after " + "the previous cue's match. Prevents narrative time " + "reversal when the same line appears multiple times " + "in the source.") + ap.add_argument("--max-source-gap", type=float, default=None, + help="seconds. 
When set, any adjacent assignment whose " + "|source-time gap| exceeds this earns a warning.") ap.add_argument("--format", choices=["form-a", "form-b"], default="form-a", dest="output_format") ap.add_argument("-o", "--output", type=Path, required=True, @@ -542,6 +609,8 @@ def main() -> None: min_score=args.min_score, allow_reuse=args.allow_reuse, keep_audio_events=args.keep_audio_events, + monotonic_source=args.monotonic_source, + max_source_gap_warn=args.max_source_gap, ) matched = sum(1 for a in assignments if a.cand is not None) diff --git a/tests/test_recommend_edit_plan.py b/tests/test_recommend_edit_plan.py index 1c4dd0e..4453cde 100644 --- a/tests/test_recommend_edit_plan.py +++ b/tests/test_recommend_edit_plan.py @@ -313,6 +313,133 @@ def test_review_markdown_content(rec, helpers_ns, tmp_path): # --------------------------------------------------------------------------- +def test_backward_source_jump_warns_by_default(rec, helpers_ns, tmp_path): + """When a later cue matches an earlier source position, a warning fires. + + Two cues both want a line that appears twice in the source. Greedy + matching with no constraint picks the EARLIEST instance for the + earlier-ID cue (because Jaccard score breaks ties by first hit), then + the SECOND instance for the later cue — so source time advances and + no warning. We construct the inverse: make the earlier-ID cue prefer + the LATER instance (longer duration → better duration_similarity), + leaving only the earlier instance for the later cue, producing a + backward jump that must be flagged. + """ + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + # Cue 1 prefers a 2.0s match; cue 2 prefers a 1.0s match. 
+ helpers_ns.write_srt(srt, [ + (1, 0.0, 2.0, "alpha alpha alpha"), + (2, 2.0, 3.0, "alpha alpha alpha"), + ]) + write_transcript(transcript, [ + # Early instance: 1.0s duration → cue 2 will prefer it + {"text": "alpha", "start": 5.0, "end": 5.4, "type": "word"}, + {"text": "alpha", "start": 5.4, "end": 5.7, "type": "word"}, + {"text": "alpha.", "start": 5.7, "end": 6.0, "type": "word"}, + # Late instance: 2.0s duration → cue 1 will prefer it + {"text": "alpha", "start": 20.0, "end": 20.7, "type": "word"}, + {"text": "alpha", "start": 20.7, "end": 21.4, "type": "word"}, + {"text": "alpha.", "start": 21.4, "end": 22.0, "type": "word"}, + ]) + + assignments = rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + ) + # Cue 1 picked the 2s late instance, cue 2 picked the 1s early one → backward + assert assignments[0].cand.start >= 20.0 + assert assignments[1].cand.start <= 6.0 + backward_warnings = [w for w in assignments[1].warnings + if "backward" in w] + assert backward_warnings, \ + f"expected a backward-time warning on cue 2, got: {assignments[1].warnings}" + + +def test_monotonic_source_prevents_backward_jump(rec, helpers_ns, tmp_path): + """With --monotonic-source the same setup must NOT pick the early + instance for cue 2. The constraint forces cue 2's candidate to start + at or after cue 1's end.""" + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [ + (1, 0.0, 2.0, "alpha alpha alpha"), + (2, 2.0, 3.0, "alpha alpha alpha"), + ]) + # Same as the previous test PLUS a third late instance so cue 2 has a + # forward option under the constraint. 
+ write_transcript(transcript, [ + {"text": "alpha", "start": 5.0, "end": 5.4, "type": "word"}, + {"text": "alpha", "start": 5.4, "end": 5.7, "type": "word"}, + {"text": "alpha.", "start": 5.7, "end": 6.0, "type": "word"}, + {"text": "alpha", "start": 20.0, "end": 20.7, "type": "word"}, + {"text": "alpha", "start": 20.7, "end": 21.4, "type": "word"}, + {"text": "alpha.", "start": 21.4, "end": 22.0, "type": "word"}, + {"text": "alpha", "start": 30.0, "end": 30.4, "type": "word"}, + {"text": "alpha", "start": 30.4, "end": 30.7, "type": "word"}, + {"text": "alpha.", "start": 30.7, "end": 31.0, "type": "word"}, + ]) + + assignments = rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + monotonic_source=True, + ) + assert assignments[0].cand.start >= 20.0 + assert assignments[1].cand.start >= assignments[0].cand.end - 1e-6, \ + "cue 2's candidate must start at or after cue 1's end under monotonic" + # No backward warning under monotonic mode + assert not any("backward" in w for w in assignments[1].warnings) + + +def test_max_source_gap_warning(rec, helpers_ns, tmp_path): + """--max-source-gap fires a warning when the gap exceeds the threshold.""" + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [ + (1, 0.0, 1.0, "alpha"), + (2, 1.0, 2.0, "beta"), + ]) + write_transcript(transcript, [ + {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"}, + # Big gap to next: beta is at 60+ seconds away + {"text": "beta.", "start": 65.0, "end": 65.5, "type": "word"}, + ]) + assignments = rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + max_source_gap_warn=10.0, # gap is ~63.5s, well over 10s + ) + jump_warnings = [w for w in assignments[1].warnings + if "source-time jump" in w] + assert jump_warnings, \ + f"expected a big-gap warning, got: {assignments[1].warnings}" + + +def 
test_monotonic_with_no_forward_candidate_fails(rec, helpers_ns, tmp_path): + """If no candidate can satisfy the monotonic constraint, the cue gets + the 'no candidate available at or after ...' warning and write_plan + hard-fails (per the no-candidate contract).""" + srt = tmp_path / "script.srt" + transcript = tmp_path / "transcript.json" + helpers_ns.write_srt(srt, [ + (1, 0.0, 1.0, "alpha"), + (2, 1.0, 2.0, "beta"), + ]) + write_transcript(transcript, [ + # alpha matches at 20s, taking cue 1 + {"text": "alpha.", "start": 20.0, "end": 21.0, "type": "word"}, + # beta only available BEFORE alpha — monotonic can't reach it + {"text": "beta.", "start": 5.0, "end": 6.0, "type": "word"}, + ]) + with pytest.raises(SystemExit): + rec.recommend( + script_srt=srt, transcript=transcript, source=Path("fake.mp4"), + output=tmp_path / "plan.json", + monotonic_source=True, + ) + + def test_e2e_recommend_then_render( rec, sde, helpers_ns, synth_av, tmp_path ): From a6e686dd4c41f9a08b85751c4a1f7a31fa41c64e Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Wed, 20 May 2026 22:46:30 +0800 Subject: [PATCH 13/18] feat(srt_video_editor): cut to SRT duration, validate, clean stale clips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refines the per-cue cut to honour the SRT timeline as the authoritative target rather than the source range from edit_plan.json. The plan defines WHERE to cut from in the source; the SRT defines HOW MUCH to keep. When the two disagree, the SRT wins or the run aborts — never silently produces a clip with the wrong runtime. Per-cue duration logic: source_duration = plan.source_end - plan.source_start srt_duration = cue.end - cue.start source_duration <= 0 -> hard error, points at the id source_duration < srt_duration -> hard error with both durations, suggests extending the source range or shortening the cue source_duration >= srt_duration -> cut exactly srt_duration. 
Any extra source tail is discarded. This keeps every clip's runtime aligned with its caption window, so the concat output's total runtime matches the SRT timeline within the encoder's keyframe rounding. ffmpeg call tightened: - timestamps now passed in HH:MM:SS.ms form (the dot variant ffmpeg expects), not raw seconds. New format_ts_dot() helper. - `-map 0:v:0 -map 0:a?` explicitly select the first video stream and an optional audio stream — a video-only source no longer crashes the run. - `-preset veryfast -crf 18` for sharper picture at slightly higher encode cost. - `-ar 48000 -ac 2` standardise audio to 48 kHz stereo so per-clip re-encodes share parameters, making the downstream `-c copy` concat safe. Stale clip_*.mp4 in the temp dir are removed before cutting so a previous run with sparser ids does not leave misleading neighbours. The glob pattern is `clip_*.mp4` only — user-placed files (notes, recordings, etc.) are untouched. _concat.txt is owned by concat_clips and stays out of this cleanup. Per-clip log now includes the cue subtitle text (truncated to 60 chars) so the terminal output reads as a human-checkable record. Smoke-verified four scenarios on the 10s lavfi synth: 1. source == srt durations: identical to previous behaviour (5.021s output for 5.0s SRT total — same keyframe drift as before). 2. source > srt: 4s and 3s source ranges with 1s SRT cues each → clips measured at 1.000s exactly. 3. source < srt (id=2: 0.5s source vs 2s SRT): clip 1 cuts; run aborts with `plan id=2: source range is shorter than SRT cue. source_duration=0.500s, srt_duration=2.000s. ...` 4. stale cleanup: pre-existing clip_998.mp4 + clip_999.mp4 removed, unrelated keepme.txt preserved. cut_clip signature is now (source, start, cut_duration, out_path), replacing the prior (source, start, end, out_path) — internal to this file, no external callers. 
Co-Authored-By: Claude Opus 4.7 --- srt_video_editor.py | 106 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 23 deletions(-) diff --git a/srt_video_editor.py b/srt_video_editor.py index b1c3d68..5818d1d 100644 --- a/srt_video_editor.py +++ b/srt_video_editor.py @@ -63,6 +63,7 @@ def parse_ts(s: str) -> float: def format_ts(seconds: float) -> str: + """SRT-style HH:MM:SS,ms — comma separator (for log output / errors).""" total_ms = int(round(seconds * 1000)) h, rem = divmod(total_ms, 3600_000) m, rem = divmod(rem, 60_000) @@ -70,6 +71,16 @@ def format_ts(seconds: float) -> str: return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" +def format_ts_dot(seconds: float) -> str: + """ffmpeg-style HH:MM:SS.ms — dot separator. Used for `-ss` / `-t` args. + + SRT timestamps use a comma between seconds and milliseconds; ffmpeg + expects a dot. The two forms refer to the same point in time but the + comma form is rejected by ffmpeg's parser. + """ + return format_ts(seconds).replace(",", ".") + + # ---------- parsers ---------- @@ -155,25 +166,34 @@ def validate_ids(cues: list[dict], plan: list[dict]) -> None: # ---------- report ---------- -def cut_clip(source: Path, start: float, end: float, out_path: Path) -> None: - """Cut [start, end] from source to out_path, re-encoded for frame accuracy. +def cut_clip(source: Path, start: float, cut_duration: float, + out_path: Path) -> None: + """Cut `cut_duration` seconds starting at `start` from source. + + `-ss` placed before `-i` makes ffmpeg do a fast container-level seek + to the nearest keyframe, then libx264 re-encodes from there — + frame-accurate at the cost of one encode pass. Stream copy (`-c copy`) + would be faster but cuts at keyframes only, which makes downstream + concat / sync less predictable; we trade a few seconds of encode + time per clip for cleaner cut boundaries. - Keeps the original audio. 
`-ss` placed before `-i` makes ffmpeg do a - fast container-level seek to the nearest keyframe, then libx264 - re-encodes from there — frame-accurate at the cost of one encode pass. + Audio is mapped optionally via `-map 0:a?` so a video-only source + does not crash the run. Video is the first stream (`-map 0:v:0`). - Raises SystemExit with the full ffmpeg command + stderr on failure so - the caller never has to scroll the terminal to find what went wrong. + Raises SystemExit with the full ffmpeg command + stderr on failure + so the caller never has to scroll the terminal to find what went wrong. """ - duration = end - start cmd = [ "ffmpeg", "-y", "-hide_banner", "-nostats", - "-ss", f"{start:.3f}", + "-ss", format_ts_dot(start), "-i", str(source), - "-t", f"{duration:.3f}", - "-c:v", "libx264", "-preset", "fast", "-crf", "20", - "-c:a", "aac", "-b:a", "192k", + "-t", format_ts_dot(cut_duration), + "-map", "0:v:0", + "-map", "0:a?", + "-c:v", "libx264", "-preset", "veryfast", "-crf", "18", "-pix_fmt", "yuv420p", + "-c:a", "aac", "-b:a", "192k", + "-ar", "48000", "-ac", "2", "-movflags", "+faststart", str(out_path), ] @@ -204,13 +224,37 @@ def extract_clips( ) -> list[Path]: """Cut one clip per cue. Returns the list of output paths in cue-id order. - Filenames are `clip_.mp4`, indexed by SRT id (not position) so - each clip is traceable back to its cue at a glance even if ids are - sparse or non-consecutive. + Per-cue duration logic: + source_duration = plan.source_end - plan.source_start + srt_duration = cue.end - cue.start + + source_duration <= 0 -> hard error pointing at the id + source_duration < srt_duration -> hard error (source is too short + to cover the SRT cue; either + extend the source range or + shorten the cue) + source_duration >= srt_duration -> cut exactly `srt_duration` + starting at source_start. Any + extra source tail is discarded. 
+ + Stale `clip_*.mp4` files in `temp_dir` are removed before cutting so + a previous failed run with sparser ids doesn't leave misleading + leftovers next to the new clips. The `_concat.txt` from a future + concat step is NOT touched here — concat owns its own list file. + + Filenames are `clip_.mp4`, indexed by SRT id (not position). """ plan_by_id = {p["id"]: p for p in plan} temp_dir.mkdir(parents=True, exist_ok=True) + # Pre-clean stale clip files. Only the clip_*.mp4 pattern so user- + # created neighbours (notes, recordings, etc.) are left alone. + stale = sorted(temp_dir.glob("clip_*.mp4")) + if stale: + print(f"clearing {len(stale)} stale clip(s) from {temp_dir}/") + for p in stale: + p.unlink() + print() print(f"cutting {len(cues)} clip(s) -> {temp_dir}/") outputs: list[Path] = [] @@ -218,19 +262,35 @@ def extract_clips( cid = cue["id"] p = plan_by_id[cid] start = p["source_start"] - end = p["source_end"] - duration = end - start - if duration <= 0: + source_duration = p["source_end"] - start + srt_duration = cue["end"] - cue["start"] + + if source_duration <= 0: raise SystemExit( - f"plan id={cid}: source_end {format_ts(end)} <= " - f"source_start {format_ts(start)} (duration {duration:.3f}s)" + f"plan id={cid}: source_end {format_ts(p['source_end'])} <= " + f"source_start {format_ts(start)} " + f"(source_duration {source_duration:.3f}s)" ) + if source_duration < srt_duration - 1e-6: + raise SystemExit( + f"plan id={cid}: source range is shorter than SRT cue. " + f"source_duration={source_duration:.3f}s, " + f"srt_duration={srt_duration:.3f}s. " + f"Extend the source range or shorten the SRT cue." + ) + # source_duration >= srt_duration: cut exactly srt_duration + cut_duration = srt_duration + out_path = temp_dir / f"clip_{cid:03d}.mp4" + text_preview = cue["text"].replace("\n", " ").strip() + if len(text_preview) > 60: + text_preview = text_preview[:57] + "..." 
print( - f" id={cid:>3} {format_ts(start)} -> {format_ts(end)} " - f"({duration:.3f}s) -> {out_path}" + f" id={cid:>3} src@{format_ts(start)} " + f"cut={cut_duration:.3f}s -> {out_path}\n" + f" text: {text_preview!r}" ) - cut_clip(source, start, end, out_path) + cut_clip(source, start, cut_duration, out_path) outputs.append(out_path) return outputs From fd107e20d5235996df3658017e5a17abbc04e7c9 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Thu, 21 May 2026 00:09:40 +0800 Subject: [PATCH 14/18] feat: --mode extract stops the pipeline after segment extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new pipeline-stage selector to both srt_driven_edit (single job + batch manifest) and run_episodes (directory batch). main.py is a thin argv-forwarder so `python main.py --mode extract` works without any change to that file. Behavior: --mode full (default) extract -> concat -> voice mix -> subtitle burn -> QC report. Unchanged. --mode extract Stop after per-cue segment extraction. Real source segments are copied to /extracted_clips_/clip_.mp4 Synthetic black+silence gap clips are NOT generated (they are a concat-time concept). The output mp4 path is not touched, the overwrite-protection check is skipped, and the QC report stage is skipped. The EDL is still written — useful for inspecting the cue -> source mapping that produced the clips. Naming: extracted clip filenames are `clip_.mp4`, matching the convention srt_video_editor.py established at the project root. The work_dir's internal seg_NN_idXX.mp4 layout stays unchanged so caching keeps working across modes. Cleanup: a stale `clip_*.mp4` in extracted_clips_/ is removed before the new clips are copied in, so a previous run with a sparser plan doesn't leave misleading neighbours. Non-clip files (notes.txt, recordings, etc.) are untouched. Wiring: - Job dataclass gains `mode: str = "full"`. 
- srt_driven_edit CLI adds --mode (choices: full | extract). - job_from_dict accepts "mode" per manifest row, defaulting to the CLI's mode, with validation; an invalid value fails the row. - run_episodes adds matching --mode + the kwarg threaded through run_episodes() and _make_job's opts dict. - main.py needs no changes: _inject_defaults only adds defaults for flags the user did not provide, so --mode is forwarded as-is. Tests (4 new, 56 total): - test_extract_mode_stops_after_clips: 3 cues -> three clip_NNN.mp4 with correct per-cue durations; no out.mp4 produced. - test_extract_mode_skips_gap_clips: 1.5s SRT gap between cue 2 and cue 3 does NOT manifest as a gap_*.mp4 in the extracted dir. - test_extract_mode_cleans_stale_clips: planted clip_998.mp4 is removed; planted notes.txt is preserved. - test_run_episodes_extract_mode: --mode extract across 2 eps, each gets its own extracted_clips_<name>/ with clip_NNN.mp4 inside, no per-ep final.mp4. End-to-end smoke via main.py: python main.py --source ... --srt ... --plan ... --mode extract produced 3 clips at exactly 1.500 / 1.500 / 2.000 seconds matching the SRT cue durations; no final.mp4 was created.
Co-Authored-By: Claude Opus 4.7 --- helpers/run_episodes.py | 11 +++++ helpers/srt_driven_edit.py | 68 +++++++++++++++++++++++++++-- tests/test_run_episodes.py | 27 ++++++++++++ tests/test_srt_driven_e2e.py | 85 +++++++++++++++++++++++++++++++++++- 4 files changed, 187 insertions(+), 4 deletions(-) diff --git a/helpers/run_episodes.py b/helpers/run_episodes.py index 8f6300b..6047c85 100644 --- a/helpers/run_episodes.py +++ b/helpers/run_episodes.py @@ -109,6 +109,7 @@ def _make_job(ep: EpisodeJob, opts: dict) -> Job: no_cache=opts["no_cache"], keep_intermediates=opts["keep_intermediates"], no_overwrite=opts["no_overwrite"], + mode=opts.get("mode", "full"), ) @@ -126,6 +127,7 @@ def run_episodes( no_overwrite: bool = False, keep_intermediates: bool = False, continue_on_error: bool = False, + mode: str = "full", ) -> dict: """Discover + run every episode under `root`. Returns a summary dict and also writes it to `/run_episodes_summary.json`.""" @@ -145,6 +147,7 @@ def run_episodes( "no_cache": no_cache, "no_overwrite": no_overwrite, "keep_intermediates": keep_intermediates, + "mode": mode, } results: list[dict] = [] @@ -208,6 +211,13 @@ def main() -> None: ap.add_argument("--keep-intermediates", action="store_true") ap.add_argument("--continue-on-error", action="store_true", help="skip episodes that fail instead of aborting") + ap.add_argument( + "--mode", choices=["full", "extract"], default="full", + help="'full' (default) runs the complete pipeline per episode. " + "'extract' stops after segment extraction and saves clips " + "under each ep's edit/ dir; gap clips, voice mixing, " + "subtitle burn, and QC report are skipped.", + ) args = ap.parse_args() versions = preflight() @@ -226,6 +236,7 @@ def main() -> None: no_overwrite=args.no_overwrite, keep_intermediates=args.keep_intermediates, continue_on_error=args.continue_on_error, + mode=args.mode, ) # Exit nonzero if any episode failed (even with --continue-on-error, # the caller probably wants to know). 
diff --git a/helpers/srt_driven_edit.py b/helpers/srt_driven_edit.py index 75904dd..7a65c86 100644 --- a/helpers/srt_driven_edit.py +++ b/helpers/srt_driven_edit.py @@ -1194,6 +1194,7 @@ class Job: no_cache: bool keep_intermediates: bool no_overwrite: bool = False + mode: str = "full" # "full" (default) | "extract" (stop after segments) def run_job(job: Job, ffmpeg_version: str) -> dict: @@ -1284,13 +1285,16 @@ def run_job(job: Job, ffmpeg_version: str) -> dict: edit_dir / f"final_srt_driven_{safe_ascii_name(job.name)}.mp4" ) - if out_path.exists(): + # Output-overwrite check only matters in modes that actually produce + # final output. Extract mode stops before any out_path is written, so + # checking it would produce spurious warnings about an unrelated file. + if job.mode == "full" and out_path.exists(): if job.no_overwrite: raise SystemExit(f"output exists and --no-overwrite set: {out_path}") print(f" WARNING: overwriting existing output: {out_path}") style_resolved = resolve_style(job.style, cues) - print(f" style: {job.style} ({len(cues)} cues, cjk={has_cjk(cues)})") + print(f" style: {job.style} ({len(cues)} cues, cjk={has_cjk(cues)}) mode={job.mode}") # All intermediates live in a safe-ASCII temp dir under tempfile.gettempdir(). # Wiped at start so a previous crashed run cannot pollute. Wiped at end @@ -1319,7 +1323,10 @@ def run_job(job: Job, ffmpeg_version: str) -> dict: print(f"\n extracting {len(segments)} segments cache={'off' if job.no_cache else 'on'} voice={'per-seg' if any_voice else 'none'}") for i, seg in enumerate(segments): - if seg.leading_gap > 0.001: + # Gap clips are a concat-time concept (synthetic black + silence + # bridging non-contiguous SRT cues). Extract mode emits only the + # real source segments, so skip gap clips entirely there. 
+ if job.mode != "extract" and seg.leading_gap > 0.001: gap_path = clips_dir / f"gap_{i:02d}_{seg.leading_gap:.3f}.mp4" if not gap_path.exists(): make_gap_clip(seg.leading_gap, portrait, gap_path) @@ -1355,6 +1362,44 @@ def run_job(job: Job, ffmpeg_version: str) -> dict: clip_paths.append(seg_path) seg_clip_info.append({"clip_path": str(seg_path), "cached": cached_hit}) + # ---- Extract mode: copy clips to a persistent location and stop ---- + if job.mode == "extract": + extracted_dir = edit_dir / f"extracted_clips_{safe_ascii_name(job.name)}" + extracted_dir.mkdir(parents=True, exist_ok=True) + # Wipe stale clips from a prior run so the dir reflects only this + # run's segments — same pattern as srt_video_editor.py. + for stale in extracted_dir.glob("clip_*.mp4"): + stale.unlink() + copied: list[dict] = [] + for seg, info in zip(segments, seg_clip_info): + src = Path(info["clip_path"]) + if not src.exists(): + continue + dst = extracted_dir / f"clip_{seg.id:03d}.mp4" + shutil.copy2(src, dst) + copied.append({ + "id": seg.id, + "filename": dst.name, + "expected_duration_s": round(seg.duration, 3), + "cached_from_prev_run": info["cached"], + }) + print(f"\n=== extract mode: stopping after segment extraction ===") + print(f" {len(copied)} clip(s) saved to: {extracted_dir}/") + for c in copied: + print(f" {c['filename']:<24} " + f"({c['expected_duration_s']:.3f}s)" + + (" [cache hit]" if c["cached_from_prev_run"] else "")) + return { + "job": job.name, + "ok": True, + "mode": "extract", + "extracted_dir": str(extracted_dir), + "clip_count": len(copied), + "segments": copied, + "elapsed_s": round(time.time() - t0, 2), + } + # ---- Full mode continues to concat + compose ---- + base_path = work_dir / "base.mp4" concat_clips(clip_paths, base_path, work_dir) @@ -1518,6 +1563,13 @@ def _bool(key: str, fb: bool) -> bool: manifest_dir / f"final_srt_driven_{safe_ascii_name(job_name)}_{idx:02d}.mp4" ) + row_mode = _str("mode", getattr(defaults, "mode", "full")) + if row_mode 
not in ("full", "extract"): + raise SystemExit( + f"manifest row {idx}: invalid mode {row_mode!r}; " + "expected 'full' or 'extract'" + ) + return Job( source=_path("source"), srt=srt_path, @@ -1534,6 +1586,7 @@ def _bool(key: str, fb: bool) -> bool: no_cache=_bool("no_cache", defaults.no_cache), keep_intermediates=_bool("keep_intermediates", defaults.keep_intermediates), no_overwrite=_bool("no_overwrite", defaults.no_overwrite), + mode=row_mode, ) @@ -1564,6 +1617,14 @@ def main() -> None: ap.add_argument("--fontsdir", type=Path, default=None, help="extra fonts directory passed to libass.") ap.add_argument("-o", "--output", type=Path, default=None) + ap.add_argument( + "--mode", choices=["full", "extract"], default="full", + help="'full' (default) runs extract -> concat -> subtitle burn. " + "'extract' stops after segment extraction and saves per-cue " + "clips to /extracted_clips_/clip_.mp4; " + "gap clips, voice mixing, subtitle burn, and QC report are " + "skipped.", + ) ap.add_argument("--no-cache", action="store_true") ap.add_argument("--no-overwrite", action="store_true", help="refuse to run if output file already exists.") @@ -1634,6 +1695,7 @@ def main() -> None: no_cache=args.no_cache, keep_intermediates=args.keep_intermediates, no_overwrite=args.no_overwrite, + mode=args.mode, ) run_job(job, versions["ffmpeg"]) diff --git a/tests/test_run_episodes.py b/tests/test_run_episodes.py index 956cf52..612b523 100644 --- a/tests/test_run_episodes.py +++ b/tests/test_run_episodes.py @@ -209,6 +209,33 @@ def test_run_episodes_continues_past_corrupt_plan_json( assert (batch / "ep03" / "final.mp4").exists() +def test_run_episodes_extract_mode( + runner, helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """--mode extract across multiple eps: each ep produces clip_*.mp4 in its + own edit/extracted_clips_/ and NOT a final.mp4.""" + batch = tmp_path / "batch" + for name in ("ep01", "ep02"): + _make_ep(batch / name, synth_av, helpers_ns) + + summary = 
runner.run_episodes( + batch, ffmpeg_version=ffmpeg_version, mode="extract", + ) + + assert summary["episodes_total"] == 2 + assert summary["ok"] == 2 + for r in summary["results"]: + assert r["mode"] == "extract" + assert r["clip_count"] == 2 # CUES_2 has 2 cues + extracted_dir = Path(r["extracted_dir"]) + assert extracted_dir.is_dir() + assert (extracted_dir / "clip_001.mp4").exists() + assert (extracted_dir / "clip_002.mp4").exists() + # No final.mp4 in any ep dir + for name in ("ep01", "ep02"): + assert not (batch / name / "final.mp4").exists() + + def test_run_episodes_per_ep_voice( runner, helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path ): diff --git a/tests/test_srt_driven_e2e.py b/tests/test_srt_driven_e2e.py index d33b9a8..ea5f05e 100644 --- a/tests/test_srt_driven_e2e.py +++ b/tests/test_srt_driven_e2e.py @@ -32,7 +32,8 @@ def make_job(helpers_ns, srt_path, plan_path, tmp_path, *, source=None, voice=None, bg_volume=0.0, - style="auto", no_overwrite=False, output=None): + style="auto", no_overwrite=False, output=None, + mode="full"): sde = helpers_ns.sde return sde.Job( source=source, @@ -50,6 +51,7 @@ def make_job(helpers_ns, srt_path, plan_path, tmp_path, *, no_cache=False, keep_intermediates=False, no_overwrite=no_overwrite, + mode=mode, ) @@ -336,6 +338,87 @@ def test_global_voice_cache_independence( "mixed in the final pass, not baked into segments" +def test_extract_mode_stops_after_clips( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """--mode extract must produce per-cue clips and NOT a concat'd final.mp4.""" + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, mode="extract") + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + + # Extract-mode result shape differs from the QC report + assert qc["ok"] is True + assert qc["mode"] == "extract" + assert 
qc["clip_count"] == 3 + extracted_dir = Path(qc["extracted_dir"]) + assert extracted_dir.is_dir() + + # Clips renamed to clip_.mp4 (matches srt_video_editor convention) + for cid in (1, 2, 3): + clip = extracted_dir / f"clip_{cid:03d}.mp4" + assert clip.is_file(), f"missing extracted clip: {clip}" + # Each clip should match its cue duration within encoder rounding + actual = helpers_ns.sde.probe_duration(clip) + expected = next(c for c in DEFAULT_CUES if c[0] == cid) + expected_dur = expected[2] - expected[1] + assert abs(actual - expected_dur) < 0.25, \ + f"clip {cid}: actual {actual}s vs expected {expected_dur}s" + + # And NO final.mp4 was produced — extract mode stopped early + assert not (tmp_path / "out.mp4").exists() + + +def test_extract_mode_skips_gap_clips( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """In extract mode, the synthetic black+silence gap clips are not made — + only real source extractions land in extracted_clips_/.""" + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + # Cues with a 1.5s gap between id=2 and id=3 (final_end=4.5, final_start=6.0) + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, mode="extract") + qc = helpers_ns.sde.run_job(job, ffmpeg_version) + extracted_dir = Path(qc["extracted_dir"]) + # Only 3 clips (one per cue) — no gap_*.mp4 sneaks in + files = sorted(p.name for p in extracted_dir.iterdir()) + assert files == ["clip_001.mp4", "clip_002.mp4", "clip_003.mp4"] + + +def test_extract_mode_cleans_stale_clips( + helpers_ns, ffmpeg_version, synth_av, tmp_path +): + """A previous extract-mode run's stale clips must be removed before this + run writes its own. Otherwise leftover clip_999.mp4 would pollute the dir. 
+ """ + srt = tmp_path / "script.srt" + plan = tmp_path / "plan.json" + helpers_ns.write_srt(srt, DEFAULT_CUES) + helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN) + + job = make_job(helpers_ns, srt, plan, tmp_path, + source=synth_av, mode="extract") + qc1 = helpers_ns.sde.run_job(job, ffmpeg_version) + extracted_dir = Path(qc1["extracted_dir"]) + + # Plant a stale clip + an unrelated note file + (extracted_dir / "clip_998.mp4").write_bytes(b"stale") + (extracted_dir / "notes.txt").write_text("user notes", encoding="utf-8") + + qc2 = helpers_ns.sde.run_job(job, ffmpeg_version) + files = sorted(p.name for p in Path(qc2["extracted_dir"]).iterdir()) + assert "clip_998.mp4" not in files, "stale clip should have been removed" + assert "notes.txt" in files, "non-clip user files must be preserved" + + def test_gap_inserted_in_output(helpers_ns, ffmpeg_version, synth_av, tmp_path): srt = tmp_path / "script.srt" plan = tmp_path / "plan.json" From 6f583cc8c690aa60a0533db76e4873a04c3e7604 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Thu, 21 May 2026 01:00:09 +0800 Subject: [PATCH 15/18] fix(srt_video_editor): friendly errors for bad JSON and bad SRT duration Two parsing-stage failures were surfacing as confusing late errors: 1. edit_plan.json with a JSON syntax error -> json.loads raised JSONDecodeError straight through the stack. Now wrapped: the SystemExit message names the file, the offending line/column, and the parser's complaint. edit_plan is not valid JSON: plan_bad_json.json: line 1 col 44: Expecting ',' delimiter 2. SRT cue with end <= start -> srt_duration became <= 0 and was forwarded as `-t` to ffmpeg, producing an opaque "Output file is empty" or similar message from ffmpeg. Now caught in parse_srt per cue, pinned to the id, before any extraction starts. SRT id=2: end 00:00:03,000 <= start 00:00:05,000 (srt_duration -2.000s). Fix the timestamp in script.srt. 
Both errors fire BEFORE ffmpeg is invoked, so no half-written clips leak into temp/ and no encoder cycles are spent on doomed cuts. The end==start zero-duration corner is also caught (would have been a silent zero-byte output from ffmpeg). Happy path unchanged: smoke run still produces three clips at exactly the SRT cue durations and concatenates to the expected final.mp4. Co-Authored-By: Claude Opus 4.7 --- srt_video_editor.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/srt_video_editor.py b/srt_video_editor.py index 5818d1d..7b9c953 100644 --- a/srt_video_editor.py +++ b/srt_video_editor.py @@ -89,6 +89,10 @@ def parse_srt(path: Path) -> list[dict]: Tolerates UTF-8 with or without BOM, CRLF / LF line endings, and SRT cue settings ('position:90% align:start') trailing the time line. + + Per-cue duration is validated here: `end <= start` is rejected with + an id-pinned error so the downstream ffmpeg call never sees a + non-positive `-t` argument. """ raw = path.read_text(encoding="utf-8-sig") cues: list[dict] = [] @@ -105,6 +109,12 @@ def parse_srt(path: Path) -> list[dict]: left, right = lines[1].split("-->", 1) start = parse_ts(left.strip().split()[-1]) end = parse_ts(right.strip().split()[0]) + if end <= start: + raise SystemExit( + f"SRT id={cid}: end {format_ts(end)} <= start " + f"{format_ts(start)} (srt_duration {end - start:.3f}s). " + f"Fix the timestamp in {path}." + ) text = "\n".join(lines[2:]) cues.append({"id": cid, "start": start, "end": end, "text": text}) if not cues: @@ -115,8 +125,20 @@ def parse_srt(path: Path) -> list[dict]: def parse_plan(path: Path) -> list[dict]: """Return a list of {id, source_start, source_end}. Only Form A is accepted here (a flat JSON array); Form B is out of scope for the - minimal version.""" - data = json.loads(path.read_text(encoding="utf-8")) + minimal version. 
+ + JSON syntax errors are reported as a SystemExit with the file path + plus the offending line / column / message, rather than as a bare + JSONDecodeError traceback. + """ + raw = path.read_text(encoding="utf-8") + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise SystemExit( + f"edit_plan is not valid JSON: {path}: " + f"line {e.lineno} col {e.colno}: {e.msg}" + ) if not isinstance(data, list): raise SystemExit( "edit_plan.json must be a JSON array of " From 5d36126a00fa1ba3e4e79d7e656b82864242937a Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Thu, 21 May 2026 09:23:22 +0800 Subject: [PATCH 16/18] feat(srt_video_editor): ffprobe each clip after cut, print actual vs target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the per-cue extract loop completes, walk the output clips through ffprobe and print actual container duration alongside the requested target duration: verifying 3 clip(s) with ffprobe: clip_001.mp4: 1.50s,target: 1.50s clip_002.mp4: 1.50s,target: 1.50s clip_003.mp4: 2.00s,target: 2.00s Why: libx264 can drift a few hundredths of a second from the requested cut window (keyframe / GOP rounding). Printing both lets the user spot a clip that's wildly off — e.g. ffmpeg silently produced a 0s output because of a bad time spec — without having to manually probe each file. New `probe_clip_duration(path)` helper returns None when ffprobe is missing or the file is unreadable, in which case the line degrades to `(probe failed)`. The verification is informational only and never aborts the run after a successful extract. Output formatting matches the spec literally — Chinese fullwidth comma between actual and target, two-decimal seconds. Happy-path smoke run on the 10s lavfi synth shows all three clips matching target exactly. 
Co-Authored-By: Claude Opus 4.7 --- srt_video_editor.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/srt_video_editor.py b/srt_video_editor.py index 7b9c953..2477e4b 100644 --- a/srt_video_editor.py +++ b/srt_video_editor.py @@ -188,6 +188,34 @@ def validate_ids(cues: list[dict], plan: list[dict]) -> None: # ---------- report ---------- +def probe_clip_duration(path: Path) -> float | None: + """Return the duration of `path` in seconds via ffprobe. + + Returns None if ffprobe is missing or the file is unreadable — + verification is informational, so probe failures should not abort + the run after a successful extraction. + """ + cmd = [ + "ffprobe", "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + str(path), + ] + try: + proc = subprocess.run( + cmd, capture_output=True, text=True, + encoding="utf-8", errors="replace", + ) + except FileNotFoundError: + return None + if proc.returncode != 0: + return None + try: + return float(proc.stdout.strip()) + except ValueError: + return None + + def cut_clip(source: Path, start: float, cut_duration: float, out_path: Path) -> None: """Cut `cut_duration` seconds starting at `start` from source. @@ -280,6 +308,7 @@ def extract_clips( print() print(f"cutting {len(cues)} clip(s) -> {temp_dir}/") outputs: list[Path] = [] + targets: list[float] = [] # parallel to outputs — used by post-verify pass for cue in sorted(cues, key=lambda c: c["id"]): cid = cue["id"] p = plan_by_id[cid] @@ -314,6 +343,22 @@ def extract_clips( ) cut_clip(source, start, cut_duration, out_path) outputs.append(out_path) + targets.append(cut_duration) + + # ---- ffprobe verification ---- + # Container duration can drift a few hundredths of a second from the + # target after re-encoding (libx264 GOP / first-keyframe boundary). + # Print the actual vs target side by side so the user can spot a + # clip that's wildly off — e.g. ffmpeg silently truncated to 0s. 
+ print() + print(f"verifying {len(outputs)} clip(s) with ffprobe:") + for out_path, target in zip(outputs, targets): + actual = probe_clip_duration(out_path) + if actual is None: + print(f" {out_path.name}: (probe failed),target: {target:.2f}s") + else: + print(f" {out_path.name}: {actual:.2f}s,target: {target:.2f}s") + return outputs From 923ad0992ddda3d874558f141f2768f9207bcfe8 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Thu, 21 May 2026 14:51:14 +0800 Subject: [PATCH 17/18] feat(transcribe): replace ElevenLabs Scribe with DashScope Paraformer-v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the ASR backend in helpers/transcribe.py with Alibaba DashScope's paraformer-realtime-v2 model, called via the official `dashscope` Python SDK in file mode (the SDK handles WebSocket framing internally given a local WAV path). The downstream recommend_edit_plan helper consumes transcripts unchanged because the output is deliberately kept Scribe-shaped: { "language_code": "auto" | "<language hint>", "_source": "dashscope-paraformer-realtime-v2", "words": [ {"text": "...", "start": 1.234, "end": 1.567, "type": "word"}, ... ] } Audio extraction (mono 16kHz PCM WAV via ffmpeg), per-file caching (`<edit_dir>/transcripts/<video_stem>.json`), and the `transcribe_one` function signature are all unchanged, so transcribe_batch.py keeps working with no caller-side changes. BREAKING CHANGES (documented; SKILL.md / README / install.md still mention ElevenLabs and need a follow-up doc pass): - Env var renamed: ELEVENLABS_API_KEY -> DASHSCOPE_API_KEY. Generate one at https://dashscope.console.aliyun.com/ and put `DASHSCOPE_API_KEY=...` in <repo>/.env (same .env discovery the old code used). - Speaker diarization is gone — paraformer does not segment speakers. `speaker_id` is omitted from word records. 
The `--num-speakers` CLI flag is removed from both transcribe.py and transcribe_batch.py; transcribe_one still accepts the kwarg (for call-site stability) but ignores it with a one-line note in verbose mode. - Audio events ("(laughter)", "(applause)") with `type: audio_event` are gone — paraformer does not tag them. The recommender's `--keep-audio-events` flag has nothing to keep but does not error. - New runtime dependency: `dashscope>=1.20` added to pyproject main deps. `pip install -e .` will pick it up. Tradeoff rationale: DashScope batch mode (the Transcription endpoint) needs publicly-reachable URLs, which would force every user to wire up OSS or a tunnel. The realtime file-mode path streams the local WAV directly through the SDK with no external infrastructure, at the cost of one WebSocket round-trip per file. Word-level timestamps come back at the same granularity Scribe provided. Conversion is in a pure helper `_convert_dashscope_to_scribe()` that folds DashScope's separately-emitted `punctuation` field into the text of the word it follows, drops empty/whitespace entries, and skips words with non-numeric timestamps rather than crashing the whole job on one bad word. Six unit tests cover the conversion, including a cross-module check that `recommend_edit_plan.load_transcript_words` can consume the output directly (the contract we promise). End-to-end ASR validation requires a live key + network and is left to the user to verify after rotating the credential they just leaked in chat. 
Suggested smoke run after rotation: pip install dashscope echo 'DASHSCOPE_API_KEY=' >> .env python helpers/transcribe.py path/to/clip.mp4 --language zh Co-Authored-By: Claude Opus 4.7 --- helpers/transcribe.py | 210 +++++++++++++++++++++++++++--------- helpers/transcribe_batch.py | 16 +-- pyproject.toml | 1 + tests/test_transcribe.py | 126 ++++++++++++++++++++++ 4 files changed, 289 insertions(+), 64 deletions(-) create mode 100644 tests/test_transcribe.py diff --git a/helpers/transcribe.py b/helpers/transcribe.py index 26d3906..5319539 100644 --- a/helpers/transcribe.py +++ b/helpers/transcribe.py @@ -1,16 +1,37 @@ -"""Transcribe a video with ElevenLabs Scribe. +"""Transcribe a video with Alibaba DashScope Paraformer-v2 (realtime, file mode). + +Extracts mono 16kHz PCM audio via ffmpeg, streams it to DashScope's +paraformer-realtime-v2 model via the official `dashscope` SDK, and +writes a Scribe-compatible JSON transcript so the downstream +recommend_edit_plan helper keeps working without changes. + +Output schema (intentionally Scribe-shaped): + { + "language_code": "auto" | "", + "_source": "dashscope-paraformer-realtime-v2", + "words": [ + {"text": "你好", "start": 1.234, "end": 1.567, "type": "word"}, + ... + ] + } + +Tradeoffs vs the previous ElevenLabs Scribe integration: + - No speaker diarization — paraformer does not segment speakers, + so `speaker_id` is omitted from every word record. + - No audio events — Scribe's "(laughter)" / "(applause)" entries + with `"type": "audio_event"` are simply absent. + - The `--num-speakers` flag is accepted by transcribe_one for + backward compatibility with transcribe_batch but ignored. -Extracts mono 16kHz audio via ffmpeg, uploads to Scribe with verbatim + -diarize + audio events + word-level timestamps, writes the full response -to /transcripts/.json. +Cached: if the output transcript already exists, the API call is skipped. -Cached: if the output file already exists, the upload is skipped. 
+API key: + DASHSCOPE_API_KEY in /.env or in the environment. Usage: python helpers/transcribe.py + python helpers/transcribe.py --language zh python helpers/transcribe.py --edit-dir /custom/edit - python helpers/transcribe.py --language en - python helpers/transcribe.py --num-speakers 2 """ from __future__ import annotations @@ -24,29 +45,34 @@ import time from pathlib import Path -import requests - -SCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text" +DASHSCOPE_MODEL = "paraformer-realtime-v2" +ENV_VAR = "DASHSCOPE_API_KEY" def load_api_key() -> str: + """Read DASHSCOPE_API_KEY from /.env, ./.env, or the environment.""" for candidate in [Path(__file__).resolve().parent.parent / ".env", Path(".env")]: if candidate.exists(): - for line in candidate.read_text().splitlines(): + for line in candidate.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line or line.startswith("#") or "=" not in line: continue k, v = line.split("=", 1) - if k.strip() == "ELEVENLABS_API_KEY": + if k.strip() == ENV_VAR: return v.strip().strip('"').strip("'") - v = os.environ.get("ELEVENLABS_API_KEY", "") + v = os.environ.get(ENV_VAR, "") if not v: - sys.exit("ELEVENLABS_API_KEY not found in .env or environment") + sys.exit( + f"{ENV_VAR} not found in .env or environment. " + f"Generate one at https://dashscope.console.aliyun.com/ " + f"and put `{ENV_VAR}=...` in /.env." + ) return v def extract_audio(video_path: Path, dest: Path) -> None: + """Extract mono 16kHz PCM WAV — the format paraformer-v2 expects.""" cmd = [ "ffmpeg", "-y", "-i", str(video_path), "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", @@ -55,36 +81,100 @@ def extract_audio(video_path: Path, dest: Path) -> None: subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) -def call_scribe( +def _convert_dashscope_to_scribe( + sentences: list[dict], + language_hint: str | None, +) -> dict: + """Flatten DashScope sentence/word structure into Scribe-compatible shape. 
+ + DashScope returns: + sentence: [ + {begin_time, end_time, text, + words: [{begin_time, end_time, text, punctuation}, ...]} + ] + + recommend_edit_plan.load_transcript_words wants a flat words[] with + seconds-based start/end and a 'word' type marker. Convert here so the + consumer stays Scribe-shaped and we don't need to touch recommender code. + + Punctuation tokens that DashScope splits onto their own word entry are + folded into the preceding word's text — closer to how Scribe formatted + them. Empty / whitespace-only text entries are dropped. + """ + words: list[dict] = [] + for sent in sentences or []: + for w in (sent.get("words") or []): + text = (w.get("text") or "").strip() + if not text: + continue + punct = (w.get("punctuation") or "").strip() + try: + start_ms = float(w.get("begin_time") or 0) + end_ms = float(w.get("end_time") or 0) + except (TypeError, ValueError): + continue + words.append({ + "text": text + punct, + "start": start_ms / 1000.0, + "end": end_ms / 1000.0, + "type": "word", + }) + return { + "language_code": language_hint or "auto", + "_source": f"dashscope-{DASHSCOPE_MODEL}", + "words": words, + } + + +def call_dashscope( audio_path: Path, api_key: str, language: str | None = None, - num_speakers: int | None = None, ) -> dict: - data: dict[str, str] = { - "model_id": "scribe_v1", - "diarize": "true", - "tag_audio_events": "true", - "timestamps_granularity": "word", - } - if language: - data["language_code"] = language - if num_speakers: - data["num_speakers"] = str(num_speakers) - - with open(audio_path, "rb") as f: - resp = requests.post( - SCRIBE_URL, - headers={"xi-api-key": api_key}, - files={"file": (audio_path.name, f, "audio/wav")}, - data=data, - timeout=1800, + """Call paraformer-realtime-v2 in file mode. Returns Scribe-shaped dict. + + The dashscope SDK handles WebSocket framing internally when given a + local file path — no manual chunking required. 
Defensive against + minor SDK shape variations: tolerates both `output.sentence` and + `output.sentences` (the docs and the wire format have shifted). + """ + try: + import dashscope + from dashscope.audio.asr import Recognition + except ImportError: + raise SystemExit( + "dashscope package not installed. Install with:\n" + " pip install dashscope\n" + "(or `pip install -e .` from the repo root once dashscope is in " + "your project deps)." ) - if resp.status_code != 200: - raise RuntimeError(f"Scribe returned {resp.status_code}: {resp.text[:500]}") + dashscope.api_key = api_key + + language_hints = [language] if language else None - return resp.json() + recognition = Recognition( + model=DASHSCOPE_MODEL, + format="wav", + sample_rate=16000, + language_hints=language_hints, + callback=None, + ) + response = recognition.call(file=str(audio_path)) + + status = getattr(response, "status_code", None) + if status != 200: + msg = getattr(response, "message", None) or str(response) + request_id = getattr(response, "request_id", "") + raise RuntimeError( + f"DashScope {DASHSCOPE_MODEL} returned status={status} " + f"request_id={request_id}: {msg}" + ) + + output = getattr(response, "output", None) or {} + # Both shapes seen in the wild; honour either. + sentences = output.get("sentence") or output.get("sentences") or [] + return _convert_dashscope_to_scribe(sentences, language) def transcribe_one( @@ -97,8 +187,19 @@ def transcribe_one( ) -> Path: """Transcribe a single video. Returns path to transcript JSON. + `num_speakers` is accepted for backward compatibility with the previous + ElevenLabs Scribe interface (and with transcribe_batch.py's call site) + but is ignored — paraformer does not perform speaker diarization. A + one-line note is printed when a non-None value is supplied in verbose mode. + Cached: returns existing path immediately if the transcript already exists. 
""" + if num_speakers is not None and verbose: + print( + f" (note: --num-speakers={num_speakers} ignored — DashScope " + f"{DASHSCOPE_MODEL} has no speaker diarization)" + ) + transcripts_dir = edit_dir / "transcripts" transcripts_dir.mkdir(parents=True, exist_ok=True) out_path = transcripts_dir / f"{video.stem}.json" @@ -117,23 +218,33 @@ def transcribe_one( extract_audio(video, audio) size_mb = audio.stat().st_size / (1024 * 1024) if verbose: - print(f" uploading {video.stem}.wav ({size_mb:.1f} MB)", flush=True) - payload = call_scribe(audio, api_key, language, num_speakers) - - out_path.write_text(json.dumps(payload, indent=2)) + print( + f" streaming {video.stem}.wav ({size_mb:.1f} MB) " + f"to DashScope {DASHSCOPE_MODEL}", + flush=True, + ) + payload = call_dashscope(audio, api_key, language) + + # ensure_ascii=False so CJK characters are stored as-is (smaller file + + # human-readable when inspecting transcripts). + out_path.write_text( + json.dumps(payload, ensure_ascii=False, indent=2), + encoding="utf-8", + ) dt = time.time() - t0 if verbose: kb = out_path.stat().st_size / 1024 - print(f" saved: {out_path.name} ({kb:.1f} KB) in {dt:.1f}s") - if isinstance(payload, dict) and "words" in payload: - print(f" words: {len(payload['words'])}") + words_count = len(payload.get("words", [])) + print(f" saved: {out_path.name} ({kb:.1f} KB, {words_count} words) in {dt:.1f}s") return out_path def main() -> None: - ap = argparse.ArgumentParser(description="Transcribe a video with ElevenLabs Scribe") + ap = argparse.ArgumentParser( + description=f"Transcribe a video with DashScope {DASHSCOPE_MODEL}" + ) ap.add_argument("video", type=Path, help="Path to video file") ap.add_argument( "--edit-dir", @@ -145,13 +256,7 @@ def main() -> None: "--language", type=str, default=None, - help="Optional ISO language code (e.g., 'en'). Omit to auto-detect.", - ) - ap.add_argument( - "--num-speakers", - type=int, - default=None, - help="Optional number of speakers when known. 
Improves diarization accuracy.", + help="Language hint (e.g. 'zh', 'en', 'ja'). Omit to auto-detect.", ) args = ap.parse_args() @@ -167,7 +272,6 @@ def main() -> None: edit_dir=edit_dir, api_key=api_key, language=args.language, - num_speakers=args.num_speakers, ) diff --git a/helpers/transcribe_batch.py b/helpers/transcribe_batch.py index 5aeb1d6..3fe86e0 100644 --- a/helpers/transcribe_batch.py +++ b/helpers/transcribe_batch.py @@ -1,14 +1,15 @@ """Batch-transcribe every video in a directory with 4 parallel workers. -Walks for common video extensions, runs ElevenLabs Scribe on -each, writes transcripts to /edit/transcripts/.json. +Walks for common video extensions, transcribes each via +DashScope paraformer-v2 (see helpers/transcribe.py), writes transcripts +to /edit/transcripts/.json. Cached per-file: any source that already has a transcript is skipped. Usage: python helpers/transcribe_batch.py python helpers/transcribe_batch.py --workers 4 - python helpers/transcribe_batch.py --num-speakers 2 + python helpers/transcribe_batch.py --language zh python helpers/transcribe_batch.py --edit-dir /custom/edit """ @@ -48,13 +49,7 @@ def main() -> None: "--language", type=str, default=None, - help="Optional ISO language code. Omit to auto-detect per file.", - ) - ap.add_argument( - "--num-speakers", - type=int, - default=None, - help="Optional number of speakers. Improves diarization when known.", + help="Language hint (e.g. 'zh', 'en'). 
Omit to auto-detect per file.", ) args = ap.parse_args() @@ -91,7 +86,6 @@ def main() -> None: edit_dir=edit_dir, api_key=api_key, language=args.language, - num_speakers=args.num_speakers, verbose=False, ): v for v in pending diff --git a/pyproject.toml b/pyproject.toml index 45ce7ad..651d0fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ license = { file = "LICENSE" } requires-python = ">=3.10" dependencies = [ "requests", + "dashscope>=1.20", "librosa", "matplotlib", "pillow", diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py new file mode 100644 index 0000000..8ce55cd --- /dev/null +++ b/tests/test_transcribe.py @@ -0,0 +1,126 @@ +"""Unit tests for transcribe.py — only the pure conversion logic. + +API calls require a live DashScope key and external network; those are +intentionally out of scope here. Run an end-to-end smoke manually: + + python helpers/transcribe.py path/to/clip.mp4 --language zh +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pytest + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT / "helpers")) + + +@pytest.fixture +def transcribe_mod(): + import transcribe as t + return t + + +def test_convert_basic_sentence(transcribe_mod): + """One sentence with two words gets flattened into Scribe-shaped words[].""" + sentences = [ + { + "begin_time": 0, + "end_time": 1500, + "text": "你好世界", + "words": [ + {"begin_time": 0, "end_time": 500, "text": "你好", "punctuation": ""}, + {"begin_time": 500, "end_time": 1500, "text": "世界", "punctuation": "。"}, + ], + } + ] + out = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint="zh") + assert out["language_code"] == "zh" + assert out["_source"].startswith("dashscope-") + assert len(out["words"]) == 2 + assert out["words"][0] == {"text": "你好", "start": 0.0, "end": 0.5, "type": "word"} + # Punctuation gets folded into the preceding word's text + assert out["words"][1] == {"text": "世界。", "start": 
0.5, "end": 1.5, "type": "word"} + + +def test_convert_drops_empty_text(transcribe_mod): + """Whitespace-only / empty word entries are skipped, not emitted as junk.""" + sentences = [ + {"words": [ + {"begin_time": 0, "end_time": 100, "text": ""}, + {"begin_time": 100, "end_time": 200, "text": " "}, + {"begin_time": 200, "end_time": 400, "text": "hello"}, + ]} + ] + out = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint=None) + assert len(out["words"]) == 1 + assert out["words"][0]["text"] == "hello" + # No language hint → "auto" + assert out["language_code"] == "auto" + + +def test_convert_multiple_sentences(transcribe_mod): + """Words from multiple sentences flatten into a single ordered list.""" + sentences = [ + {"words": [ + {"begin_time": 0, "end_time": 500, "text": "first"}, + ]}, + {"words": [ + {"begin_time": 1000, "end_time": 1500, "text": "second"}, + {"begin_time": 1500, "end_time": 2000, "text": "third"}, + ]}, + ] + out = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint="en") + assert [w["text"] for w in out["words"]] == ["first", "second", "third"] + assert out["words"][0]["start"] == 0.0 + assert out["words"][-1]["end"] == 2.0 + + +def test_convert_tolerates_missing_or_bad_timestamps(transcribe_mod): + """A word with non-numeric timestamps is skipped rather than crashing + the whole conversion.""" + sentences = [ + {"words": [ + {"begin_time": "bad", "end_time": 500, "text": "junk"}, + {"begin_time": 0, "end_time": 500, "text": "good"}, + ]} + ] + out = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint=None) + assert [w["text"] for w in out["words"]] == ["good"] + + +def test_convert_empty_input(transcribe_mod): + """Empty / None input returns a structurally valid envelope with no words.""" + out = transcribe_mod._convert_dashscope_to_scribe([], language_hint=None) + assert out["words"] == [] + assert "language_code" in out and "_source" in out + + out_none = 
transcribe_mod._convert_dashscope_to_scribe(None, language_hint=None) + assert out_none["words"] == [] + + +def test_output_shape_compatible_with_recommender(transcribe_mod, tmp_path): + """Conversion produces JSON that recommend_edit_plan.load_transcript_words + can consume directly — this is the cross-module contract we promise.""" + import json + import recommend_edit_plan as rec + + sentences = [ + {"words": [ + {"begin_time": 1000, "end_time": 1500, "text": "hello", "punctuation": ""}, + {"begin_time": 1500, "end_time": 2000, "text": "world", "punctuation": "."}, + ]} + ] + transcript = transcribe_mod._convert_dashscope_to_scribe(sentences, language_hint="en") + + out_file = tmp_path / "transcript.json" + out_file.write_text(json.dumps(transcript, ensure_ascii=False), encoding="utf-8") + + words = rec.load_transcript_words(out_file) + assert len(words) == 2 + assert words[0]["text"] == "hello" + assert words[1]["text"] == "world." + assert words[0]["start"] == 1.0 + assert words[1]["end"] == 2.0 From 16f99a2b410f0af4a6e32e89dff6440755504d86 Mon Sep 17 00:00:00 2001 From: xiaogang-sudo <15689210561@163.com> Date: Thu, 21 May 2026 15:19:45 +0800 Subject: [PATCH 18/18] fix(transcribe): pin DashScope to Mainland China endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A stale DASHSCOPE_HTTP_BASE_URL / DASHSCOPE_WEBSOCKET_BASE_URL env var (or a multi-account shell) can silently route the SDK to the international host, where a domestic key returns a misleading 401 "Unauthorized, your api-key is invalid!" — even though the key is fine on the domestic side. 
Set both URLs explicitly in call_dashscope() so the binding is deterministic regardless of the surrounding environment: dashscope.base_http_api_url = https://dashscope.aliyuncs.com/api/v1 dashscope.base_websocket_api_url = wss://dashscope.aliyuncs.com/api-ws/v1/inference These are the SDK defaults, so this is a no-op for fresh installs but a real fix for environments that have been touched by intl tooling. Verified end-to-end on a 10s lavfi synth source (testsrc2 + 440Hz sine): the call now returns 200 in ~1s with an empty words[] (correct — pure tone has no speech). Output JSON is the promised Scribe-shaped envelope: {"language_code": "en", "_source": "dashscope-paraformer-realtime-v2", "words": []} Co-Authored-By: Claude Opus 4.7 --- helpers/transcribe.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/helpers/transcribe.py b/helpers/transcribe.py index 5319539..a8b28ac 100644 --- a/helpers/transcribe.py +++ b/helpers/transcribe.py @@ -150,6 +150,15 @@ def call_dashscope( ) dashscope.api_key = api_key + # Pin to the Mainland China endpoints explicitly. Both URLs are the SDK + # defaults, but stale DASHSCOPE_HTTP_BASE_URL / DASHSCOPE_WEBSOCKET_BASE_URL + # env vars (left over from an international account) would otherwise route + # us to the wrong region and produce a misleading 401 from the intl host + # even when the key is valid on the domestic side. + dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1" + dashscope.base_websocket_api_url = ( + "wss://dashscope.aliyuncs.com/api-ws/v1/inference" + ) language_hints = [language] if language else None