diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..7715dcb
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,23 @@
+# Agent Review Instructions
+
+You are reviewing a Python + ffmpeg video editing tool.
+
+Main goal:
+Build a reliable SRT-driven video editor for Chinese drama recap videos.
+
+Please focus on:
+- code structure
+- ffmpeg stability
+- SRT parsing correctness
+- JSON validation
+- Windows path compatibility
+- Chinese subtitle rendering
+- error handling
+- extensibility
+
+Do not rewrite the entire project unless necessary.
+Prefer small, safe patches.
+Classify suggestions into:
+1. Must fix
+2. Should improve
+3. Later
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..61b1e4c
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,33 @@
+# srt_video_editor 项目说明
+
+本项目是电视剧解说自动剪辑工具。
+
+核心目标：
+根据 script.srt 和 edit_plan.json，从 source.mp4 中截取画面，拼接、加配音、烧字幕，输出 final.mp4。
+
+当前阶段：
+只做 SRT 驱动剪辑，不做 AI 自动理解剧情。
+
+技术要求：
+- Python 3.10+
+- ffmpeg
+- Windows 优先
+- 路径尽量使用英文
+- 不使用 moviepy，优先直接调用 ffmpeg
+- 输出日志要清楚
+- 不要引入复杂前端
+
+核心输入：
+- input/source.mp4
+- input/script.srt
+- input/edit_plan.json
+- input/voice.wav
+
+核心输出：
+- output/final.mp4
+
+禁止事项：
+- 不要破解剪映
+- 不要调用未授权接口
+- 不要一次性做复杂 AI 自动分析
+- 不要改动 input 原始文件
diff --git a/examples/srt_driven/_smoke_test.py b/examples/srt_driven/_smoke_test.py
new file mode 100644
index 0000000..88053e2
--- /dev/null
+++ b/examples/srt_driven/_smoke_test.py
@@ -0,0 +1,421 @@
+"""Regression tests for srt_driven_edit. Run with bare `python` — no pytest."""
+
+import sys
+import tempfile
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "helpers"))
+
+from srt_driven_edit import (
+    parse_srt, parse_plan, align, validate_srt, validate_plan,
+    validate_alignment, resolve_style, has_cjk, STYLE_TEMPLATES,
+    subs_filter_escape, safe_ascii_name,
+    concat_quote_path, read_srt_text, make_safe_work_dir,
+    _split_time_line, V_SYNC_TAIL, A_SYNC_TAIL, SRT_ENCODINGS,
+    ensure_safe_subs_path, _path_is_filter_safe,
+    PARAMS_FINGERPRINT, CACHE_VERSION, cache_key,
+    Segment,
+)
+
+base = Path(__file__).resolve().parent
+
+
+def section(title: str) -> None:
+    print(f"\n=== {title} ===")
+
+
+def ok(msg: str) -> None:
+    print(f"  ok: {msg}")
+
+
+def fail(msg: str) -> None:
+    raise SystemExit(f"  FAIL: {msg}")
+
+
+# -- 1. Legacy Form A + Form B parsing -----------------------------------
+# Parses the two example plans shipped next to this script and checks the
+# (sources, voices, entries) triple returned by parse_plan for each form.
+
+section("Form A (legacy array, English)")
+# `cues` (the English script) is reused by later sections as the non-CJK sample.
+cues = parse_srt(base / "script.srt")
+validate_srt(cues)
+sources, voices, entries = parse_plan(base / "edit_plan.json")
+# Form A is a bare JSON array, so both name maps are expected to be empty.
+assert len(cues) == 3 and len(entries) == 3 and sources == {} and voices == {}
+ok("parsed 3 cues + 3 plan entries, no sources/voices map")
+assert not has_cjk(cues)
+ok("has_cjk False on English")
+
+section("Form B (object, multi-source, multi-voice)")
+sources, voices, entries = parse_plan(base / "edit_plan_v2.json")
+# Key order must follow the order written in edit_plan_v2.json.
+assert list(sources) == ["A", "B"] and list(voices) == ["host", "guest"]
+ok(f"sources={list(sources)} voices={list(voices)}")
+assert entries[0].source_name == "A" and entries[0].voice_name == "host"
+assert entries[1].source_name == "B" and entries[1].voice_name == "guest"
+# Segment 3 has no "voice" key in the example file.
+assert entries[2].source_name == "A" and entries[2].voice_name is None
+ok("per-segment source/voice refs parsed")
+
+
+# -- 2. CJK detection + auto style + style templates ---------------------
+# Checks has_cjk on a Chinese and an English script, then the three ways
+# resolve_style is called here: "auto", a template name, and a raw string.
+
+section("CJK detection + style resolution")
+cues_cjk = parse_srt(base / "script_cjk.srt")
+assert has_cjk(cues_cjk) is True
+assert not has_cjk(cues)
+ok("CJK regex matches CN/EN correctly")
+auto_cjk = resolve_style("auto", cues_cjk)
+auto_en = resolve_style("auto", cues)
+# Only the font name is checked, not the full style string.
+assert "Microsoft YaHei UI" in auto_cjk
+assert "Helvetica" in auto_en
+ok("auto style picks YaHei for CJK, Helvetica for EN")
+# A named template must win even though `cues` is the English script.
+assert STYLE_TEMPLATES["cjk-natural"] == resolve_style("cjk-natural", cues)
+ok("named template lookup")
+# A string that is not handled as "auto" or a template name must come back unchanged.
+raw = "FontName=Custom,FontSize=24"
+assert resolve_style(raw, cues) == raw
+ok("raw ASS string passthrough")
+
+
+# -- 3. SRT encoding fallback (GBK / utf-8-sig / utf-8) ------------------
+# Writes the same one-cue Chinese payload in several encodings and checks
+# that read_srt_text returns the decoded text for each of them.
+
+section("read_srt_text encoding fallback")
+# Scratch directory shared by every later section that needs to write files.
+tmp = Path(tempfile.mkdtemp(prefix="srt_smoke_"))
+
+cjk_payload = "1\n00:00:00,000 --> 00:00:03,000\n中文字幕测试\n"
+
+# utf-8
+(tmp / "u8.srt").write_bytes(cjk_payload.encode("utf-8"))
+text = read_srt_text(tmp / "u8.srt")
+assert "中文字幕测试" in text, f"utf-8 decode wrong: {text!r}"
+ok("utf-8 decoded")
+
+# utf-8 with BOM
+(tmp / "u8bom.srt").write_bytes(b"\xef\xbb\xbf" + cjk_payload.encode("utf-8"))
+text = read_srt_text(tmp / "u8bom.srt")
+# startswith("1") proves the BOM did not survive as a leading character.
+assert text.startswith("1") and "中文" in text, f"utf-8-sig decode wrong: {text!r}"
+ok("utf-8-sig BOM stripped + decoded")
+
+# gb18030 (typical Windows Chinese)
+(tmp / "gb.srt").write_bytes(cjk_payload.encode("gb18030"))
+text = read_srt_text(tmp / "gb.srt")
+assert "中文字幕测试" in text, f"gb18030 decode wrong: {text!r}"
+ok("gb18030 decoded via fallback")
+
+# cp936 (a.k.a. GBK, Windows Chinese ANSI)
+(tmp / "cp936.srt").write_bytes(cjk_payload.encode("cp936"))
+text = read_srt_text(tmp / "cp936.srt")
+assert "中文字幕测试" in text
+ok("cp936 decoded via fallback")
+
+# Now parse a GBK-encoded full SRT end-to-end
+gbk_full = (
+    "1\n00:00:00,000 --> 00:00:03,000\n这是第一条\n\n"
+    "2\n00:00:03,000 --> 00:00:06,000\n这是第二条\n"
+)
+gbk_path = tmp / "full_gbk.srt"
+gbk_path.write_bytes(gbk_full.encode("gb18030"))
+cues_gbk = parse_srt(gbk_path)
+# Cue text must round-trip exactly, not merely contain the expected characters.
+assert len(cues_gbk) == 2
+assert cues_gbk[0].text == "这是第一条"
+assert cues_gbk[1].text == "这是第二条"
+ok("parse_srt end-to-end on GB18030 input")
+
+
+# -- 4. SRT cue settings tolerance ---------------------------------------
+
+section("Cue settings on time line")
+# Real-world examples: 'position:90% align:start' on the right of -->
+samples = [
+    ("00:00:00,000 --> 00:00:03,000 position:90%", (0.0, 3.0)),
+    ("00:00:01,500 --> 00:00:04,200 align:start line:80%", (1.5, 4.2)),
+    ("  00:00:02,000   -->   00:00:05,000   X1:10 X2:200 Y1:5 Y2:50", (2.0, 5.0)),
+    ("00:00:00.500 --> 00:00:01.000", (0.5, 1.0)),  # dot fraction
+]
+for line, expected in samples:
+    a, b = _split_time_line(line)
+    from srt_driven_edit import parse_timestamp
+    got = (parse_timestamp(a), parse_timestamp(b))
+    assert abs(got[0] - expected[0]) < 1e-6 and abs(got[1] - expected[1]) < 1e-6, \
+        f"{line!r} → {got}, expected {expected}"
+ok(f"parsed {len(samples)} time lines with cue settings / odd spacing")
+
+# Full SRT with cue settings inline
+weird_srt = (
+    "1\n00:00:00,000 --> 00:00:03,000 position:90% align:start\nhello\n\n"
+    "2\n00:00:03,000 --> 00:00:07,000 line:80%\nworld\n"
+)
+weird = tmp / "weird.srt"
+weird.write_text(weird_srt, encoding="utf-8")
+parsed = parse_srt(weird)
+assert len(parsed) == 2
+assert parsed[0].final_start == 0.0 and parsed[0].final_end == 3.0
+assert parsed[0].text == "hello" and parsed[1].text == "world"
+ok("parse_srt tolerates cue settings end-to-end")
+
+
+# -- 5. concat_quote_path edge cases -------------------------------------
+# Checks the quoting shape produced by concat_quote_path for plain paths,
+# paths with spaces, paths with apostrophes, and a CJK path.
+
+section("concat_quote_path edge cases")
+cases = [
+    (Path("/tmp/foo.mp4"),                "'/tmp/foo.mp4'"),
+    (Path("/tmp/foo bar.mp4"),            "'/tmp/foo bar.mp4'"),
+    (Path("/tmp/it's.mp4"),               "'/tmp/it'\\''s.mp4'"),
+    (Path("/tmp/he said 'hi'.mp4"),       "'/tmp/he said '\\''hi'\\''.mp4'"),
+]
+# NOTE(review): `_expected` is bound but never compared, so the literal
+# strings above are documentation only; the loop checks structure, not
+# exact output.
+for p, _expected in cases:
+    got = concat_quote_path(p)
+    # We only check the structural pattern: start/end with single quote,
+    # any embedded single-quotes are properly close-escape-reopened.
+    assert got.startswith("'") and got.endswith("'"), f"{p}: {got}"
+    # Verify reverse — closing+escape+reopen idiom for any input apostrophe
+    if "'" in p.as_posix():
+        assert "'\\''" in got, f"{p}: {got}"
+    ok(f"{p.as_posix()!r:<35} → {got}")
+
+# CJK paths — verify it doesn't barf and produces a quoted UTF-8 string
+# Note: concat_quote_path calls .resolve() which prepends a drive letter on
+# Windows, so compare against the resolved posix form, not the literal input.
+cjk_p = Path("/tmp/视频 v2/片段.mp4")
+got = concat_quote_path(cjk_p)
+# This path has no apostrophe, so an exact comparison is possible here.
+assert got == f"'{cjk_p.resolve().as_posix()}'"
+assert "视频" in got and "片段" in got
+ok(f"CJK + space preserved: {got}")
+
+
+# -- 6. make_safe_work_dir produces ASCII path ---------------------------
+# Calls make_safe_work_dir with a non-ASCII job name and a plan file whose
+# own name is non-ASCII, then checks the resulting directory path.
+
+section("make_safe_work_dir")
+plan_with_cjk_path = tmp / "中文 plan.json"
+plan_with_cjk_path.write_text("[]", encoding="utf-8")
+wd = make_safe_work_dir("我的剪辑 v2!", plan_with_cjk_path)
+assert wd.exists() and wd.is_dir()
+# Path must be ASCII-only (no CJK leaks)
+assert all(ord(c) < 128 for c in str(wd)), f"work dir not ASCII: {wd}"
+assert "srt_edit_" in wd.name
+ok(f"work dir is ASCII: {wd}")
+
+# Re-creating wipes previous contents (deterministic)
+# The sentinel file stands in for leftovers from an earlier run.
+sentinel = wd / "_stale.txt"
+sentinel.write_text("old")
+wd2 = make_safe_work_dir("我的剪辑 v2!", plan_with_cjk_path)
+# Same inputs must map to the same directory, and that directory must be empty of the sentinel.
+assert wd2 == wd
+assert not sentinel.exists()
+ok("rerun wipes stale contents")
+
+
+# -- 7. Sync tails defined and reasonable --------------------------------
+# Substring checks only: the constants must mention the expected ffmpeg
+# filter fragments; their full contents are printed for the log.
+
+section("Sync tail constants")
+assert "fps=24" in V_SYNC_TAIL and "setpts=PTS-STARTPTS" in V_SYNC_TAIL
+assert "aresample=async=1" in A_SYNC_TAIL and "asetpts=PTS-STARTPTS" in A_SYNC_TAIL
+ok(f"V_SYNC_TAIL = {V_SYNC_TAIL}")
+ok(f"A_SYNC_TAIL = {A_SYNC_TAIL}")
+
+
+# -- 8. Strict validation -------------------------------------------------
+
+section("Validation errors hard-fail")
+import json as _j
+
+# duplicate id in SRT
+bad = tmp / "dup.srt"
+bad.write_text("1\n00:00:00,000 --> 00:00:01,000\na\n\n1\n00:00:01,000 --> 00:00:02,000\nb\n", encoding="utf-8")
+try:
+    validate_srt(parse_srt(bad))
+    fail("dup id should have errored")
+except SystemExit as e:
+    ok(f"dup id: {e}")
+
+# overlap
+bad.write_text("1\n00:00:00,000 --> 00:00:03,000\na\n\n2\n00:00:02,000 --> 00:00:04,000\nb\n", encoding="utf-8")
+try:
+    validate_srt(parse_srt(bad))
+    fail("overlap should have errored")
+except SystemExit as e:
+    ok(f"overlap: {e}")
+
+# non-monotonic
+bad.write_text("1\n00:00:05,000 --> 00:00:07,000\na\n\n2\n00:00:00,000 --> 00:00:02,000\nb\n", encoding="utf-8")
+try:
+    validate_srt(parse_srt(bad))
+    fail("non-monotonic should have errored")
+except SystemExit as e:
+    ok(f"non-monotonic: {e}")
+
+# end <= start in plan
+bad_plan = tmp / "bad_plan.json"
+bad_plan.write_text(_j.dumps([{"id": 1, "source_start": "00:00:05,000", "source_end": "00:00:03,000"}]), encoding="utf-8")
+try:
+    s, v, ents = parse_plan(bad_plan)
+    validate_plan(ents, s, v, Path("/fake/source.mp4"))
+    fail("end<=start should have errored")
+except SystemExit as e:
+    ok(f"end<=start: {e}")
+
+# negative source_start
+bad_plan.write_text(_j.dumps([{"id": 1, "source_start": "00:00:00,000", "source_end": "00:00:03,000"}]), encoding="utf-8")
+s, v, ents = parse_plan(bad_plan)
+ents[0].source_start = -1.0
+try:
+    validate_plan(ents, s, v, Path("/fake/source.mp4"))
+    fail("negative start should have errored")
+except SystemExit as e:
+    ok(f"negative start: {e}")
+
+# id mismatch
+ok_srt = parse_srt(base / "script.srt")
+s, v, ents = parse_plan(base / "edit_plan.json")
+from srt_driven_edit import PlanEntry
+ents.append(PlanEntry(id=99, source_name="_default", source_start=0.0, source_end=1.0, voice_name=None))
+try:
+    validate_alignment(ok_srt, ents)
+    fail("id mismatch should have errored")
+except SystemExit as e:
+    ok(f"id mismatch: {e}")
+
+
+# -- 9. Alignment + gap handling on real example -------------------------
+# Aligns the shipped English script against the Form A plan and checks the
+# output timeline. In script.srt cue 2 ends at 7.0s and cue 3 starts at
+# 8.5s, which is where the 1.5s leading gap comes from; cue 3 ends at 12.0s.
+
+section("alignment on script.srt + edit_plan.json")
+s, v, ents = parse_plan(base / "edit_plan.json")
+# The source path is a placeholder; nothing in this section opens it.
+segs = align(parse_srt(base / "script.srt"), ents, s, v,
+             legacy_default_source=Path("/fake/source.mp4"),
+             tolerance=0.5, trim_direction="tail", on_short="error")
+for sg in segs:
+    print(f"  id={sg.id} src[{sg.source_start:.3f}-{sg.source_end:.3f}] "
+          f"out[{sg.out_start:.3f}-{sg.out_end:.3f}] gap={sg.leading_gap:.3f}")
+assert abs(segs[-1].out_end - 12.0) < 1e-6
+assert abs(segs[2].leading_gap - 1.5) < 1e-6
+ok("12.0s total, 1.5s gap before id=3")
+
+
+
+# -- 10. ensure_safe_subs_path self-defense ------------------------------
+# ensure_safe_subs_path returns a (path, cleanup) pair. These cases check
+# when the input path is returned untouched and when a copy is made.
+
+section("ensure_safe_subs_path")
+# safe path (already ASCII, no single quote): returned as-is
+safe_in = tmp / "plain.srt"
+safe_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8")
+out, cleanup = ensure_safe_subs_path(safe_in)
+# No copy was made, so there is nothing to clean up.
+assert out == safe_in and cleanup is None
+ok(f"ascii input returned as-is: {out.name}")
+
+# unsafe path: CJK in name → copied to safe location
+cjk_in = tmp / "中文 字幕.srt"
+cjk_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8")
+out, cleanup = ensure_safe_subs_path(cjk_in)
+# When a copy is made, `cleanup` is the path of that copy.
+assert out != cjk_in and cleanup == out
+assert str(out).isascii(), f"safe copy still has non-ASCII chars: {out}"
+assert out.read_text(encoding="utf-8").startswith("1")
+ok(f"CJK input copied to safe path: {out}")
+cleanup.unlink()
+
+# unsafe path: single quote in name → also copied
+quote_in = tmp / "it's mine.srt"
+quote_in.write_text("1\n00:00:00,000 --> 00:00:01,000\nhi\n", encoding="utf-8")
+out, cleanup = ensure_safe_subs_path(quote_in)
+assert out != quote_in and "'" not in str(out)
+ok(f"single-quote input copied to safe path: {out}")
+cleanup.unlink()
+
+# unsafe + non-UTF-8 input gets normalized through read_srt_text
+gbk_in = tmp / "gbk 字幕.srt"
+gbk_in.write_bytes("1\n00:00:00,000 --> 00:00:01,000\n中文\n".encode("gb18030"))
+out, cleanup = ensure_safe_subs_path(gbk_in)
+# Reading the copy as UTF-8 proves it was re-encoded, not byte-copied.
+assert "中文" in out.read_text(encoding="utf-8")
+ok(f"GB18030 + CJK path → normalized utf-8 safe copy")
+cleanup.unlink()
+
+# _path_is_filter_safe sanity
+assert _path_is_filter_safe(Path("/tmp/foo.srt")) is True
+assert _path_is_filter_safe(Path("/tmp/视频.srt")) is False
+assert _path_is_filter_safe(Path("/tmp/it's.srt")) is False
+ok("_path_is_filter_safe correctly flags non-ASCII and single quote")
+
+
+# -- 11. Cache key fingerprinting ---------------------------------------
+# Checks the fingerprint constants, then that cache_key changes when the
+# ffmpeg version or the effective background volume changes.
+
+section("cache_key includes params fingerprint + ffmpeg version")
+assert isinstance(PARAMS_FINGERPRINT, str) and len(PARAMS_FINGERPRINT) == 10
+ok(f"PARAMS_FINGERPRINT = {PARAMS_FINGERPRINT}")
+assert CACHE_VERSION == 2
+ok(f"CACHE_VERSION bumped to {CACHE_VERSION}")
+
+# Build a fake segment pointed at a real file (this script) so _file_fingerprint works
+fake_seg = Segment(
+    id=1,
+    source_path=Path(__file__).resolve(),
+    source_start=0.0,
+    source_end=1.0,
+    out_start=0.0,
+    out_end=1.0,
+    leading_gap=0.0,
+    text="x",
+    voice_path=None,
+    pad_short=False,
+    plan_src_dur=1.0,
+)
+# Same segment and options; only ffmpeg_version differs between the two keys.
+k_v60 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False,
+                  voice_signature=None, ffmpeg_version="6.0")
+k_v71 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False,
+                  voice_signature=None, ffmpeg_version="7.1")
+assert k_v60 != k_v71, "different ffmpeg versions should produce different cache keys"
+ok(f"ffmpeg 6.0 → {k_v60[:16]}…, 7.1 → {k_v71[:16]}… (differ)")
+
+# Same segment and ffmpeg version; only effective_bg_volume differs.
+k_bg0 = cache_key(fake_seg, effective_bg_volume=0.0, hdr=False, portrait=False,
+                  voice_signature=None, ffmpeg_version="6.0")
+k_bg1 = cache_key(fake_seg, effective_bg_volume=0.1, hdr=False, portrait=False,
+                  voice_signature=None, ffmpeg_version="6.0")
+assert k_bg0 != k_bg1, "different effective bg_volume must invalidate cache"
+ok("effective bg_volume differs → cache key differs")
+
+
+# -- 12. preflight + probe_streams (best-effort, ffmpeg may be absent) ---
+
+section("preflight + probe_streams (only if ffmpeg installed)")
+import shutil as _sh
+import subprocess as _sp
+if _sh.which("ffmpeg") and _sh.which("ffprobe"):
+    from srt_driven_edit import preflight, probe_streams
+    versions = preflight()
+    assert "ffmpeg" in versions and "ffprobe" in versions
+    ok(f"preflight ok: {versions}")
+
+    # Build a 0.5s test mp4 with video + audio via lavfi
+    av_mp4 = tmp / "probe_av.mp4"
+    _sp.run([
+        "ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
+        "-f", "lavfi", "-i", "color=c=red:s=320x240:r=24:d=0.5",
+        "-f", "lavfi", "-i", "anullsrc=channel_layout=stereo:sample_rate=48000",
+        "-t", "0.5",
+        "-c:v", "libx264", "-pix_fmt", "yuv420p",
+        "-c:a", "aac",
+        str(av_mp4),
+    ], check=True)
+    info = probe_streams(av_mp4)
+    assert info["has_video"] is True and info["has_audio"] is True
+    assert abs(info["duration"] - 0.5) < 0.1
+    ok(f"probe video+audio mp4: {info}")
+
+    # Video-only mp4 → has_audio False, exercises the auto-degrade path
+    v_only = tmp / "probe_vonly.mp4"
+    _sp.run([
+        "ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
+        "-f", "lavfi", "-i", "color=c=red:s=320x240:r=24:d=0.5",
+        "-an", "-t", "0.5",
+        "-c:v", "libx264", "-pix_fmt", "yuv420p",
+        str(v_only),
+    ], check=True)
+    info = probe_streams(v_only)
+    assert info["has_video"] is True and info["has_audio"] is False
+    ok(f"probe video-only mp4: {info}")
+
+    # Garbage input → SystemExit, not a silent pass
+    junk = tmp / "junk.mp4"
+    junk.write_bytes(b"not a media file")
+    try:
+        probe_streams(junk)
+        fail("probe_streams on garbage should have raised")
+    except SystemExit as e:
+        ok(f"probe_streams hard-fails on junk: {str(e)[:80]}")
+else:
+    ok("ffmpeg not on PATH — preflight/probe_streams tests skipped")
+
+
+print("\n=== ALL TESTS PASSED ===")
diff --git a/examples/srt_driven/edit_plan.json b/examples/srt_driven/edit_plan.json
new file mode 100644
index 0000000..0e15d43
--- /dev/null
+++ b/examples/srt_driven/edit_plan.json
@@ -0,0 +1,17 @@
+[
+  {
+    "id": 1,
+    "source_start": "00:12:30,000",
+    "source_end": "00:12:33,000"
+  },
+  {
+    "id": 2,
+    "source_start": "00:18:05,000",
+    "source_end": "00:18:09,000"
+  },
+  {
+    "id": 3,
+    "source_start": "00:22:14,500",
+    "source_end": "00:22:18,000"
+  }
+]
diff --git a/examples/srt_driven/edit_plan_v2.json b/examples/srt_driven/edit_plan_v2.json
new file mode 100644
index 0000000..fd7530b
--- /dev/null
+++ b/examples/srt_driven/edit_plan_v2.json
@@ -0,0 +1,32 @@
+{
+  "sources": {
+    "A": "raw/take_a.mp4",
+    "B": "raw/take_b.mp4"
+  },
+  "voices": {
+    "host": "voice/host.wav",
+    "guest": "voice/guest.wav"
+  },
+  "segments": [
+    {
+      "id": 1,
+      "source": "A",
+      "source_start": "00:12:30,000",
+      "source_end": "00:12:33,000",
+      "voice": "host"
+    },
+    {
+      "id": 2,
+      "source": "B",
+      "source_start": "00:18:05,000",
+      "source_end": "00:18:09,000",
+      "voice": "guest"
+    },
+    {
+      "id": 3,
+      "source": "A",
+      "source_start": "00:22:14,500",
+      "source_end": "00:22:18,000"
+    }
+  ]
+}
diff --git a/examples/srt_driven/jobs.json b/examples/srt_driven/jobs.json
new file mode 100644
index 0000000..88f1aef
--- /dev/null
+++ b/examples/srt_driven/jobs.json
@@ -0,0 +1,25 @@
+[
+  {
+    "name": "promo_en",
+    "source": "raw/take_a.mp4",
+    "srt": "script.srt",
+    "plan": "edit_plan.json",
+    "bg_volume": 0.0,
+    "style": "bold-uppercase"
+  },
+  {
+    "name": "promo_cn",
+    "source": "raw/take_a.mp4",
+    "srt": "script_cjk.srt",
+    "plan": "edit_plan.json",
+    "bg_volume": 0.1,
+    "style": "cjk-natural"
+  },
+  {
+    "name": "promo_multi",
+    "srt": "script.srt",
+    "plan": "edit_plan_v2.json",
+    "bg_volume": 0.0,
+    "style": "auto"
+  }
+]
diff --git a/examples/srt_driven/script.srt b/examples/srt_driven/script.srt
new file mode 100644
index 0000000..dde4617
--- /dev/null
+++ b/examples/srt_driven/script.srt
@@ -0,0 +1,11 @@
+1
+00:00:00,000 --> 00:00:03,000
+Ninety percent of what an agent does is wasted.
+
+2
+00:00:03,000 --> 00:00:07,000
+We rewrote the planner from scratch this quarter.
+
+3
+00:00:08,500 --> 00:00:12,000
+Here is what changed and what it cost us.
diff --git a/examples/srt_driven/script_cjk.srt b/examples/srt_driven/script_cjk.srt
new file mode 100644
index 0000000..04dc35f
--- /dev/null
+++ b/examples/srt_driven/script_cjk.srt
@@ -0,0 +1,11 @@
+1
+00:00:00,000 --> 00:00:03,000
+百分之九十的 agent 工作都被浪费了。
+
+2
+00:00:03,000 --> 00:00:07,000
+我们这季度把 planner 重写了一遍。
+
+3
+00:00:08,500 --> 00:00:12,000
+这里讲一下改了什么、代价是什么。
diff --git a/helpers/recommend_edit_plan.py b/helpers/recommend_edit_plan.py
new file mode 100644
index 0000000..7db1c21
--- /dev/null
+++ b/helpers/recommend_edit_plan.py
@@ -0,0 +1,630 @@
+"""Recommend an edit_plan.json from script.srt + source transcript.
+
+Pipeline position:
+    script.srt + transcript.json
+      --(this script)-->
+    edit_plan.json + edit_plan_review.md
+      --(srt_driven_edit.py)-->
+    final.mp4
+
+Matching is best-effort LEXICAL (no LLM, no semantic understanding):
+    1. Parse Scribe JSON → keep only timestamped 'word' tokens. Without
+       word-level start/end timestamps we cannot produce reliable
+       source_start / source_end, so plain-text transcripts are not usable.
+    2. Build candidate ranges by breaking on sentence-end punctuation,
+       silences ≥ gap_threshold, or speaker change; split long candidates
+       at phrase punctuation then by hard word-level windows.
+    3. For each SRT cue, score every candidate by:
+         0.6 * SequenceMatcher(normalized chars)
+         + 0.4 * Jaccard (token-level for Latin / 2-gram for CJK)
+         blended with duration similarity at 0.7 / 0.3.
+       The matcher cannot understand storyline — if the SRT narration uses
+       words not present in the source transcript, scores will be low and
+       matches will need manual review.
+    4. Greedy assignment, no reuse unless --allow-reuse.
+    5. Emit Form-A or Form-B plan + a sidecar review markdown.
+
+Reserved CLI flags (placeholders, not yet wired up):
+  --packed         takes_packed.md input  (use --transcript for now)
+  --context-window padding around matched ranges
+
+Usage:
+    python helpers/recommend_edit_plan.py \\
+      --script script.srt \\
+      --transcript edit/transcripts/source.json \\
+      --source source.mp4 \\
+      -o edit_plan.json
+    python helpers/srt_driven_edit.py \\
+      --source source.mp4 --srt script.srt --plan edit_plan.json -o final.mp4
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import dataclass, field
+from difflib import SequenceMatcher
+from pathlib import Path
+
+try:
+    from srt_driven_edit import (
+        parse_srt as _parse_srt,
+        format_srt_ts,
+        CJK_RE,
+        SrtCue,  # only for type hints
+    )
+except Exception as e:
+    raise SystemExit(
+        "recommend_edit_plan: failed to import from srt_driven_edit.py. "
+        f"Both files must be importable from the same helpers/ dir. ({e})"
+    )
+
+
+# ============================================================================
+# Candidate parsing
+# ============================================================================
+
+
+SENT_END_PUNCT = set(".?!。？！")
+PHRASE_PUNCT = set(",;:，；：、")
+
+
+@dataclass
+class Candidate:
+    """A span of the source transcript that can be matched against an SRT cue."""
+
+    # Start of the span, taken from a transcript word's "start" value
+    # (presumably seconds — confirm against the transcript format).
+    start: float
+    # End of the span, taken from a transcript word's "end" value.
+    end: float
+    # Text of the words covered by the span, as produced by _join_words.
+    text: str
+
+    @property
+    def duration(self) -> float:
+        """Length of the span, computed as ``end - start``."""
+        return self.end - self.start
+
+
+def load_transcript_words(path: Path, keep_audio_events: bool = False) -> list[dict]:
+    """Return Scribe word tokens with valid timestamps. Optionally keep audio events."""
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as e:
+        raise SystemExit(f"transcript not valid JSON: {path}: {e}")
+    words = data.get("words")
+    if not isinstance(words, list):
+        raise SystemExit(f"transcript missing 'words' list: {path}")
+    out: list[dict] = []
+    for w in words:
+        wt = w.get("type")
+        if wt == "word":
+            if w.get("start") is None or w.get("end") is None:
+                continue
+            out.append(w)
+        elif wt == "audio_event" and keep_audio_events:
+            out.append(w)
+    if not out:
+        raise SystemExit(f"transcript has no usable word tokens: {path}")
+    return out
+
+
+def _join_words(words: list[dict]) -> str:
+    """Concatenate word texts. Single space between. CJK joiners are removed
+    again at normalize time so this is safe even when neighbors are Chinese."""
+    return " ".join((w.get("text") or "").strip() for w in words if (w.get("text") or "").strip())
+
+
+def _hard_split(part: list[dict], max_dur: float) -> list[Candidate]:
+    """Walk word-by-word, close a chunk as soon as adding the next word would
+    exceed max_dur. Every emitted chunk lands on a word boundary by construction.
+    """
+    out: list[Candidate] = []
+    chunk: list[dict] = []
+    cs = float(part[0]["start"])
+    for w in part:
+        we = float(w["end"])
+        if chunk and (we - cs) > max_dur:
+            ce = float(chunk[-1]["end"])
+            out.append(Candidate(cs, ce, _join_words(chunk)))
+            chunk = []
+            cs = float(w["start"])
+        chunk.append(w)
+    if chunk:
+        out.append(Candidate(cs, float(chunk[-1]["end"]), _join_words(chunk)))
+    return out
+
+
+def build_candidates(
+    words: list[dict],
+    *,
+    gap_threshold: float = 0.5,
+    max_dur: float = 12.0,
+    min_dur: float = 0.4,
+) -> list[Candidate]:
+    """Group words into phrase-level candidates. Non-overlapping by construction.
+
+    Step 1 walks the tokens in the given order and starts a new group on a
+    speaker change, on a silence of at least ``gap_threshold`` since the
+    previous word, and after a word ending in sentence-final punctuation.
+    Step 2 turns each group into one candidate, or, when the group is longer
+    than ``max_dur``, splits it at phrase punctuation and hands any part
+    that is still too long to ``_hard_split``.
+
+    Args:
+        words: transcript tokens; only those with ``type == "word"`` and
+            non-empty text are used. Assumed to be in time order — TODO
+            confirm with the caller.
+        gap_threshold: minimum silence between two words that breaks a group.
+        max_dur: longest group kept as a single candidate.
+        min_dur: candidates shorter than this are dropped from the result.
+
+    Returns:
+        The candidates in source order.
+    """
+    # Step 1: raw groups by sentence-end punct / silence / speaker change
+    raw_groups: list[list[dict]] = []
+    current: list[dict] = []
+    prev_end: float | None = None
+    prev_speaker: str | None = None
+    for w in words:
+        if w.get("type") != "word":
+            continue
+        text = (w.get("text") or "").strip()
+        if not text:
+            continue
+        ws = float(w["start"])
+        we = float(w["end"])
+        speaker = w.get("speaker_id")
+        # A speaker change only counts when both sides carry a speaker id.
+        if prev_speaker is not None and speaker is not None and speaker != prev_speaker:
+            if current:
+                raw_groups.append(current); current = []
+        if prev_end is not None and (ws - prev_end) >= gap_threshold:
+            if current:
+                raw_groups.append(current); current = []
+        current.append(w)
+        prev_end = we
+        # NOTE(review): a word without speaker_id resets prev_speaker to None,
+        # so a speaker change across such a word is not detected.
+        prev_speaker = speaker
+        # The punctuated word closes its own group (it is included, then cut).
+        if text[-1] in SENT_END_PUNCT:
+            raw_groups.append(current); current = []
+    if current:
+        raw_groups.append(current)
+
+    # Step 2: split groups that exceed max_dur — phrase punct first, then hard
+    out: list[Candidate] = []
+    for group in raw_groups:
+        if not group:
+            continue
+        start = float(group[0]["start"])
+        end = float(group[-1]["end"])
+        if end - start <= max_dur:
+            out.append(Candidate(start, end, _join_words(group)))
+            continue
+        # Too long: cut after every word that ends in phrase punctuation.
+        parts: list[list[dict]] = []
+        buf: list[dict] = []
+        for w in group:
+            buf.append(w)
+            text = (w.get("text") or "").strip()
+            if text and text[-1] in PHRASE_PUNCT:
+                parts.append(buf); buf = []
+        if buf:
+            parts.append(buf)
+        for part in parts:
+            ps = float(part[0]["start"]); pe = float(part[-1]["end"])
+            if pe - ps <= max_dur:
+                out.append(Candidate(ps, pe, _join_words(part)))
+            else:
+                # No usable punctuation inside this part: fall back to word windows.
+                out.extend(_hard_split(part, max_dur))
+
+    # Very short fragments are dropped rather than merged into a neighbour.
+    return [c for c in out if c.duration >= min_dur]
+
+
+# ============================================================================
+# Scoring
+# ============================================================================
+
+
+# Keep word characters, whitespace, and CJK ranges; replace everything else
+# (punctuation, brackets, audio-event markers) with a space.
+_NORMALIZE_RE = re.compile(
+    r"[^\w\s一-鿿㐀-䶿぀-ゟ゠-ヿ가-힯]+",
+    flags=re.UNICODE,
+)
+_WS_RE = re.compile(r"\s+")
+
+
+def normalize_text(text: str) -> str:
+    s = text.casefold()
+    s = _NORMALIZE_RE.sub(" ", s)
+    s = _WS_RE.sub(" ", s).strip()
+    return s
+
+
+def is_cjk_heavy(text: str) -> bool:
+    """True if at least half of the non-whitespace characters are CJK."""
+    chars = [c for c in text if not c.isspace()]
+    if not chars:
+        return False
+    cjk = sum(1 for c in chars if CJK_RE.match(c))
+    return cjk * 2 >= len(chars)
+
+
+def _tokens(text: str) -> list[str]:
+    return text.split()
+
+
+def _char_bigrams(text: str) -> set[str]:
+    chars = [c for c in text if not c.isspace()]
+    return {"".join(chars[i:i + 2]) for i in range(len(chars) - 1)}
+
+
+def _jaccard(a: set | list, b: set | list) -> float:
+    sa, sb = set(a), set(b)
+    if not sa and not sb:
+        return 1.0
+    if not sa or not sb:
+        return 0.0
+    return len(sa & sb) / len(sa | sb)
+
+
+def text_similarity(cue_text: str, cand_text: str) -> float:
+    """Blend of SequenceMatcher (local structure) and Jaccard (bag of units)."""
+    a = normalize_text(cue_text)
+    b = normalize_text(cand_text)
+    if not a or not b:
+        return 0.0
+    seq = SequenceMatcher(None, a, b, autojunk=False).ratio()
+    if is_cjk_heavy(a) or is_cjk_heavy(b):
+        jc = _jaccard(_char_bigrams(a), _char_bigrams(b))
+    else:
+        jc = _jaccard(_tokens(a), _tokens(b))
+    return 0.6 * seq + 0.4 * jc
+
+
+def duration_similarity(cand_dur: float, cue_dur: float) -> float:
+    if cue_dur <= 0:
+        return 0.0
+    delta = abs(cand_dur - cue_dur)
+    return 1.0 / (1.0 + delta / cue_dur)
+
+
+def combined_score(cue: SrtCue, cand: Candidate,
+                   w_text: float = 0.7, w_dur: float = 0.3) -> float:
+    return (
+        w_text * text_similarity(cue.text, cand.text)
+        + w_dur * duration_similarity(cand.duration, cue.duration)
+    )
+
+
+# ============================================================================
+# Assignment
+# ============================================================================
+
+
+@dataclass
+class Assignment:
+    """Outcome of matching one SRT cue: the chosen candidate, its score, and warnings."""
+
+    # Id, text and duration copied from the cue this assignment is for.
+    cue_id: int
+    cue_text: str
+    cue_duration: float
+    # Chosen candidate, or None when no candidate was available for the cue.
+    cand: Candidate | None
+    # Combined score of the chosen candidate; 0.0 when `cand` is None.
+    score: float
+    # Human-readable review notes (low score, duration mismatch, ...).
+    warnings: list[str] = field(default_factory=list)
+
+
+def assign(
+    cues: list[SrtCue],
+    candidates: list[Candidate],
+    *,
+    allow_reuse: bool = False,
+    min_score: float = 0.35,
+    duration_warn_ratio: float = 0.5,
+    monotonic_source: bool = False,
+    max_source_gap_warn: float | None = None,
+) -> list[Assignment]:
+    """Pick the best candidate for each cue in id order.
+
+    Selection is greedy: cues are visited in the order given and each one
+    takes the highest `combined_score` candidate still eligible at that
+    moment. Nothing is revisited afterwards.
+
+    allow_reuse: when False (default), a candidate index can be assigned
+      to at most one cue.
+
+    min_score: scores below this add a "low score" warning; the candidate
+      is still assigned.
+
+    duration_warn_ratio: relative duration difference (vs. the cue
+      duration) above which a "duration mismatch" warning is added.
+
+    monotonic_source: when True, a candidate is only considered if its
+      start time is >= the previously assigned candidate's end. Prevents
+      narrative time reversal when the same line appears multiple times
+      in the source (the matcher can otherwise pick an earlier instance
+      for a later cue).
+
+    max_source_gap_warn: if set, any adjacent assignment pair whose
+      absolute source-time gap exceeds the threshold gets a warning.
+      Soft signal — does not affect selection.
+
+    Even in non-monotonic mode, a backward source-time jump always
+    earns a warning so the review markdown surfaces it.
+
+    Returns one Assignment per cue, in cue order. A cue with no eligible
+    candidate gets `cand=None`, `score=0.0` and an explanatory warning.
+    """
+    used: set[int] = set()
+    out: list[Assignment] = []
+    # Floor that the NEXT candidate's start must clear under monotonic mode.
+    min_start_floor = 0.0
+
+    for cue in cues:
+        best_idx = -1
+        # Starting at -1.0 with a strict ">" below means the first of several
+        # equally-scored candidates wins, and a candidate scoring <= -1.0
+        # can never be selected.
+        best_score = -1.0
+        for i, cand in enumerate(candidates):
+            if not allow_reuse and i in used:
+                continue
+            # 1e-6 tolerance so a candidate starting exactly at the floor
+            # is not rejected by float noise.
+            if monotonic_source and cand.start < min_start_floor - 1e-6:
+                continue
+            s = combined_score(cue, cand)
+            if s > best_score:
+                best_score = s
+                best_idx = i
+
+        warns: list[str] = []
+        cand_out: Candidate | None = None
+        if best_idx < 0:
+            # Nothing eligible: record why, keep going with the next cue.
+            if monotonic_source:
+                warns.append(
+                    f"no candidate available at or after source time "
+                    f"{format_srt_ts(min_start_floor)} (monotonic constraint)"
+                )
+            else:
+                warns.append("no candidate available")
+            score_out = 0.0
+        else:
+            cand_out = candidates[best_idx]
+            score_out = best_score
+            if not allow_reuse:
+                used.add(best_idx)
+            if monotonic_source:
+                # Next cue must start at or after this candidate's end.
+                min_start_floor = cand_out.end
+            if best_score < min_score:
+                warns.append(f"low score {best_score:.3f} < {min_score}")
+            # Guard against division by zero for zero/negative-length cues.
+            if cue.duration > 0:
+                dd_ratio = abs(cand_out.duration - cue.duration) / cue.duration
+                if dd_ratio > duration_warn_ratio:
+                    warns.append(
+                        f"duration mismatch: cand {cand_out.duration:.2f}s vs "
+                        f"cue {cue.duration:.2f}s ({dd_ratio:.0%} off)"
+                    )
+            if cand_out.duration + 1e-6 < cue.duration:
+                warns.append(
+                    "candidate shorter than cue — will need `--on-short pad` "
+                    "in srt_driven_edit"
+                )
+        out.append(Assignment(
+            cue_id=cue.id, cue_text=cue.text, cue_duration=cue.duration,
+            cand=cand_out, score=score_out, warnings=warns,
+        ))
+
+    # Post-pass: surface source-time discontinuities as warnings on the
+    # later cue of the pair. Backward jumps are flagged in non-monotonic
+    # mode (impossible by construction in monotonic mode). Large gaps are
+    # flagged in both modes when --max-source-gap is set.
+    for i in range(1, len(out)):
+        prev_cand = out[i - 1].cand
+        curr_cand = out[i].cand
+        # Only directly adjacent assignments are compared; a pair with an
+        # unmatched side is skipped rather than bridged.
+        if prev_cand is None or curr_cand is None:
+            continue
+        gap = curr_cand.start - prev_cand.end
+        if not monotonic_source and gap < -1e-3:
+            out[i].warnings.append(
+                f"source time goes backward {gap:+.2f}s: prev cue ends at "
+                f"{format_srt_ts(prev_cand.end)}, this cue starts at "
+                f"{format_srt_ts(curr_cand.start)}"
+            )
+        if max_source_gap_warn is not None and abs(gap) > max_source_gap_warn:
+            out[i].warnings.append(
+                f"source-time jump {gap:+.2f}s exceeds "
+                f"--max-source-gap {max_source_gap_warn:.2f}s"
+            )
+
+    return out
+
+
+# ============================================================================
+# Output writers
+# ============================================================================
+
+
+def _require_all_assigned(assignments: list[Assignment]) -> None:
+    missing = [a.cue_id for a in assignments if a.cand is None]
+    if missing:
+        raise SystemExit(
+            f"no candidate found for cue(s) {missing}. "
+            "Add transcript coverage, lower --gap-threshold, or pass --allow-reuse."
+        )
+
+
+def write_plan_form_a(assignments: list[Assignment], out_path: Path) -> None:
+    _require_all_assigned(assignments)
+    rows = [
+        {
+            "id": a.cue_id,
+            "source_start": format_srt_ts(a.cand.start),
+            "source_end": format_srt_ts(a.cand.end),
+        }
+        for a in assignments
+    ]
+    out_path.write_text(json.dumps(rows, indent=2, ensure_ascii=False), encoding="utf-8")
+
+
+def write_plan_form_b(
+    assignments: list[Assignment],
+    source_path: Path,
+    source_name: str,
+    out_path: Path,
+) -> None:
+    _require_all_assigned(assignments)
+    data = {
+        "sources": {source_name: str(source_path)},
+        "segments": [
+            {
+                "id": a.cue_id,
+                "source": source_name,
+                "source_start": format_srt_ts(a.cand.start),
+                "source_end": format_srt_ts(a.cand.end),
+            }
+            for a in assignments
+        ],
+    }
+    out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
+
+
+def write_review(assignments: list[Assignment], out_path: Path) -> None:
+    lines: list[str] = ["# Edit plan review", ""]
+    total = len(assignments)
+    matched = sum(1 for a in assignments if a.cand is not None)
+    warned = sum(1 for a in assignments if a.warnings)
+    avg = (sum(a.score for a in assignments if a.cand) / max(matched, 1))
+    lines.append(f"- total cues: {total}")
+    lines.append(f"- matched: {matched}/{total}")
+    lines.append(f"- with warnings: {warned}")
+    lines.append(f"- average score: {avg:.3f}")
+    lines.append("")
+    for a in assignments:
+        lines.append(f"## cue id={a.cue_id}")
+        lines.append(f"- **cue text**: {a.cue_text!r}")
+        lines.append(f"- **cue duration**: {a.cue_duration:.3f}s")
+        if a.cand is None:
+            lines.append("- **match**: NONE")
+        else:
+            lines.append(f"- **matched text**: {a.cand.text!r}")
+            lines.append(
+                f"- **source range**: {format_srt_ts(a.cand.start)} → "
+                f"{format_srt_ts(a.cand.end)} ({a.cand.duration:.3f}s)"
+            )
+            lines.append(f"- **score**: {a.score:.3f}")
+            dd = a.cand.duration - a.cue_duration
+            lines.append(f"- **duration delta**: {dd:+.3f}s")
+        for w in a.warnings:
+            lines.append(f"- **WARNING**: {w}")
+        lines.append("")
+    out_path.write_text("\n".join(lines), encoding="utf-8")
+
+
+# ============================================================================
+# Top-level callable (used by CLI and tests)
+# ============================================================================
+
+
+def recommend(
+    *,
+    script_srt: Path,
+    transcript: Path,
+    source: Path,
+    output: Path,
+    review: Path | None = None,
+    source_name: str = "A",
+    output_format: str = "form-a",
+    gap_threshold: float = 0.5,
+    max_cand_dur: float = 12.0,
+    min_cand_dur: float = 0.4,
+    min_score: float = 0.35,
+    allow_reuse: bool = False,
+    keep_audio_events: bool = False,
+    monotonic_source: bool = False,
+    max_source_gap_warn: float | None = None,
+) -> list[Assignment]:
+    cues = _parse_srt(script_srt)
+    if not cues:
+        raise SystemExit(f"script.srt has no cues: {script_srt}")
+
+    words = load_transcript_words(transcript, keep_audio_events=keep_audio_events)
+    candidates = build_candidates(
+        words,
+        gap_threshold=gap_threshold,
+        max_dur=max_cand_dur,
+        min_dur=min_cand_dur,
+    )
+    if not candidates:
+        raise SystemExit(
+            f"no candidates built from transcript {transcript}. "
+            "Try lowering --min-cand-dur or check transcript quality."
+        )
+
+    assignments = assign(
+        cues, candidates,
+        allow_reuse=allow_reuse, min_score=min_score,
+        monotonic_source=monotonic_source,
+        max_source_gap_warn=max_source_gap_warn,
+    )
+
+    if output_format == "form-a":
+        write_plan_form_a(assignments, output)
+    elif output_format == "form-b":
+        write_plan_form_b(assignments, source, source_name, output)
+    else:
+        raise SystemExit(f"unknown --format: {output_format}")
+
+    if review is None:
+        review = output.with_name(output.stem + "_review.md")
+    write_review(assignments, review)
+    return assignments
+
+
+# ============================================================================
+# CLI
+# ============================================================================
+
+
+def main() -> None:
+    """CLI entry point: parse flags, run `recommend`, print a summary.
+
+    `--packed` and `--context-window` are accepted but not forwarded to
+    `recommend` (their help text marks them as reserved).
+    """
+    ap = argparse.ArgumentParser(
+        description="Recommend edit_plan.json from script.srt + Scribe transcript",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Example:\n"
+            "  python helpers/recommend_edit_plan.py \\\n"
+            "    --script script.srt \\\n"
+            "    --transcript edit/transcripts/source.json \\\n"
+            "    --source source.mp4 \\\n"
+            "    -o edit_plan.json\n"
+            "  python helpers/srt_driven_edit.py \\\n"
+            "    --source source.mp4 --srt script.srt --plan edit_plan.json -o final.mp4"
+        ),
+    )
+    ap.add_argument("--script", type=Path, required=True,
+                    help="script.srt (target captions timeline)")
+    ap.add_argument("--transcript", type=Path, required=True,
+                    help="Scribe transcript JSON")
+    ap.add_argument("--source", type=Path, required=True,
+                    help="source.mp4 path (recorded in Form-B plans)")
+    ap.add_argument("--packed", type=Path, default=None,
+                    help="optional takes_packed.md (reserved; unused in v1)")
+    ap.add_argument("--source-name", default="A",
+                    help="Form-B source name (default 'A')")
+    ap.add_argument("--context-window", type=float, default=1.5,
+                    help="reserved for future use")
+    ap.add_argument("--gap-threshold", type=float, default=0.5,
+                    help="silence gap (s) that breaks a candidate. default 0.5")
+    ap.add_argument("--max-cand-dur", type=float, default=12.0,
+                    help="max candidate duration before forced split. default 12.0")
+    ap.add_argument("--min-cand-dur", type=float, default=0.4,
+                    help="drop candidates shorter than this. default 0.4")
+    ap.add_argument("--min-score", type=float, default=0.35,
+                    help="score below this triggers a warning. default 0.35")
+    ap.add_argument("--allow-reuse", action="store_true",
+                    help="allow one candidate to be assigned to multiple cues")
+    ap.add_argument("--keep-audio-events", action="store_true",
+                    help="keep (laughter) (applause) tokens as candidate context")
+    ap.add_argument("--monotonic-source", action="store_true",
+                    help="require each cue's source range to start at or after "
+                         "the previous cue's match. Prevents narrative time "
+                         "reversal when the same line appears multiple times "
+                         "in the source.")
+    ap.add_argument("--max-source-gap", type=float, default=None,
+                    help="seconds. When set, any adjacent assignment whose "
+                         "|source-time gap| exceeds this earns a warning.")
+    ap.add_argument("--format", choices=["form-a", "form-b"], default="form-a",
+                    dest="output_format")
+    ap.add_argument("-o", "--output", type=Path, required=True,
+                    help="edit_plan.json path")
+    ap.add_argument("--review", type=Path, default=None,
+                    help="review .md path (default: <output>_review.md)")
+    args = ap.parse_args()
+
+    # All paths are resolved to absolute form before being handed over.
+    assignments = recommend(
+        script_srt=args.script.resolve(),
+        transcript=args.transcript.resolve(),
+        source=args.source.resolve(),
+        output=args.output.resolve(),
+        review=args.review.resolve() if args.review else None,
+        source_name=args.source_name,
+        output_format=args.output_format,
+        gap_threshold=args.gap_threshold,
+        max_cand_dur=args.max_cand_dur,
+        min_cand_dur=args.min_cand_dur,
+        min_score=args.min_score,
+        allow_reuse=args.allow_reuse,
+        keep_audio_events=args.keep_audio_events,
+        monotonic_source=args.monotonic_source,
+        max_source_gap_warn=args.max_source_gap,
+    )
+
+    # Console summary; the counters mirror those in the review markdown.
+    matched = sum(1 for a in assignments if a.cand is not None)
+    warned = sum(1 for a in assignments if a.warnings)
+    avg = sum(a.score for a in assignments if a.cand is not None) / max(matched, 1)
+    # Recomputes the default review path the same way `recommend` does.
+    review_path = (
+        args.review.resolve() if args.review
+        else args.output.resolve().with_name(args.output.stem + "_review.md")
+    )
+    print(f"wrote plan → {args.output}")
+    print(f"wrote review → {review_path}")
+    print(f"  {matched}/{len(assignments)} cues matched, avg score {avg:.3f}, "
+          f"{warned} with warnings")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/helpers/run_episodes.py b/helpers/run_episodes.py
new file mode 100644
index 0000000..6047c85
--- /dev/null
+++ b/helpers/run_episodes.py
@@ -0,0 +1,248 @@
+"""Run srt_driven_edit across every episode subdirectory under a root.
+
+Discovery convention (flat per-episode layout):
+    <root>/<ep>/source.mp4       required
+    <root>/<ep>/script.srt       required
+    <root>/<ep>/edit_plan.json   required  (Form A or B)
+    <root>/<ep>/voice.wav        optional  (global voice for this ep)
+
+Outputs:
+    <root>/<ep>/final.mp4
+    <root>/<ep>/edit/...         (EDL, QC report, cache — managed by srt_driven_edit)
+    <root>/run_episodes_summary.json
+
+Usage:
+    python helpers/run_episodes.py batch/
+    python helpers/run_episodes.py batch/ --bg-volume 0.1 --style cjk-natural
+    python helpers/run_episodes.py batch/ --continue-on-error
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import time
+from dataclasses import dataclass
+from pathlib import Path
+
+try:
+    from srt_driven_edit import (
+        Job, run_job, preflight, safe_ascii_name,
+        make_failure_record,
+    )
+except Exception as e:
+    raise SystemExit(
+        "run_episodes: failed to import from srt_driven_edit.py. "
+        f"Both files must be importable from the same helpers/ dir. ({e})"
+    )
+
+
+# File names every episode directory must contain to be picked up.
+REQUIRED_FILES = ("source.mp4", "script.srt", "edit_plan.json")
+# Optional per-episode voice track; used when present, ignored otherwise.
+OPTIONAL_VOICE = "voice.wav"
+
+
+@dataclass
+class EpisodeJob:
+    """One discovered episode directory and the input files found in it."""
+
+    name: str  # episode directory name
+    root: Path  # resolved episode directory
+    source: Path  # resolved <ep>/source.mp4
+    srt: Path  # resolved <ep>/script.srt
+    plan: Path  # resolved <ep>/edit_plan.json
+    voice: Path | None  # resolved <ep>/voice.wav, or None when absent
+
+
+def discover_episodes(root: Path) -> list[EpisodeJob]:
+    """Return episode dirs under `root` that have the required file set.
+
+    Dirs missing a required file are skipped with a printed reason — never
+    cause a hard failure here, so a partial batch is still actionable.
+    Hard-fails only if NO usable dir is found.
+    """
+    if not root.is_dir():
+        raise SystemExit(f"not a directory: {root}")
+
+    eps: list[EpisodeJob] = []
+    skipped: list[tuple[str, list[str]]] = []
+    for sub in sorted(root.iterdir(), key=lambda p: p.name):
+        if not sub.is_dir():
+            continue
+        missing = [f for f in REQUIRED_FILES if not (sub / f).is_file()]
+        if missing:
+            skipped.append((sub.name, missing))
+            continue
+        voice = sub / OPTIONAL_VOICE
+        eps.append(EpisodeJob(
+            name=sub.name,
+            root=sub.resolve(),
+            source=(sub / "source.mp4").resolve(),
+            srt=(sub / "script.srt").resolve(),
+            plan=(sub / "edit_plan.json").resolve(),
+            voice=voice.resolve() if voice.is_file() else None,
+        ))
+
+    if skipped:
+        print(f"skipped {len(skipped)} dir(s) missing required files:")
+        for name, miss in skipped:
+            print(f"  {name}: missing {', '.join(miss)}")
+    if not eps:
+        raise SystemExit(
+            f"no usable episode dirs under {root}. Each ep dir needs: "
+            f"{list(REQUIRED_FILES)}"
+        )
+    return eps
+
+
+def _make_job(ep: EpisodeJob, opts: dict) -> Job:
+    return Job(
+        source=ep.source,
+        srt=ep.srt,
+        plan=ep.plan,
+        voice=ep.voice,
+        bg_volume=opts["bg_volume"],
+        tolerance=opts["tolerance"],
+        trim_direction=opts["trim_direction"],
+        on_short=opts["on_short"],
+        style=opts["style"],
+        fontsdir=opts["fontsdir"],
+        output=ep.root / "final.mp4",
+        name=ep.name,
+        no_cache=opts["no_cache"],
+        keep_intermediates=opts["keep_intermediates"],
+        no_overwrite=opts["no_overwrite"],
+        mode=opts.get("mode", "full"),
+    )
+
+
+def run_episodes(
+    root: Path,
+    *,
+    ffmpeg_version: str,
+    bg_volume: float = 0.0,
+    tolerance: float = 0.5,
+    trim_direction: str = "tail",
+    on_short: str = "error",
+    style: str = "auto",
+    fontsdir: Path | None = None,
+    no_cache: bool = False,
+    no_overwrite: bool = False,
+    keep_intermediates: bool = False,
+    continue_on_error: bool = False,
+    mode: str = "full",
+) -> dict:
+    """Discover + run every episode under `root`. Returns a summary dict and
+    also writes it to `<root>/run_episodes_summary.json`.
+
+    ffmpeg_version is forwarded to `run_job` for each episode; the other
+    keyword options are copied unchanged into every episode's Job.
+
+    continue_on_error: when True, a failing episode is recorded through
+      `make_failure_record` and the batch goes on. When False, the first
+      failure propagates and no summary file is written.
+
+    The summary's "ok" count is the number of results with a truthy "ok"
+    entry.
+    """
+    root = root.resolve()
+    eps = discover_episodes(root)
+    print(f"\ndiscovered {len(eps)} episode(s) under {root}:")
+    for ep in eps:
+        print(f"  {ep.name}  voice={'yes' if ep.voice else 'no'}")
+
+    # Batch-wide settings shared by every episode's Job.
+    opts = {
+        "bg_volume": bg_volume,
+        "tolerance": tolerance,
+        "trim_direction": trim_direction,
+        "on_short": on_short,
+        "style": style,
+        "fontsdir": fontsdir,
+        "no_cache": no_cache,
+        "no_overwrite": no_overwrite,
+        "keep_intermediates": keep_intermediates,
+        "mode": mode,
+    }
+
+    results: list[dict] = []
+    t0 = time.time()
+    for i, ep in enumerate(eps):
+        print(f"\n[{i + 1}/{len(eps)}] === {ep.name} ===")
+        job = _make_job(ep, opts)
+        try:
+            qc = run_job(job, ffmpeg_version)
+            results.append(qc)
+        # SystemExit is not an Exception subclass, so it is listed
+        # explicitly; KeyboardInterrupt still propagates.
+        except (SystemExit, Exception) as e:
+            if continue_on_error:
+                print(f"[{i + 1}/{len(eps)}] FAILED: "
+                      f"{type(e).__name__}: {e}")
+                results.append(make_failure_record(
+                    index=i, name=ep.name, error=e, job=job,
+                ))
+                continue
+            raise
+
+    ok = sum(1 for r in results if r.get("ok"))
+    summary = {
+        "root": str(root),
+        "episodes_total": len(eps),
+        "ok": ok,
+        "elapsed_s": round(time.time() - t0, 2),
+        "results": results,
+    }
+    summary_path = root / "run_episodes_summary.json"
+    summary_path.write_text(
+        json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8"
+    )
+    print(f"\n{ok}/{len(results)} episodes ok ({summary['elapsed_s']}s)")
+    print(f"summary → {summary_path}")
+    return summary
+
+
+def main() -> None:
+    """CLI entry point: check ffmpeg/ffprobe, run the batch, set exit code.
+
+    Exits with status 1 when fewer episodes succeeded than were
+    discovered.
+    """
+    # NOTE(review): the description mentions ep*/ but discover_episodes in
+    # this file accepts any immediate subdirectory with the required files.
+    ap = argparse.ArgumentParser(
+        description="Run srt_driven_edit across every ep*/ subdirectory.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Per-episode layout:\n"
+            "  <root>/<ep>/source.mp4        required\n"
+            "  <root>/<ep>/script.srt        required\n"
+            "  <root>/<ep>/edit_plan.json    required (Form A or B)\n"
+            "  <root>/<ep>/voice.wav         optional\n\n"
+            "Outputs land at <root>/<ep>/final.mp4 with edit/ artifacts."
+        ),
+    )
+    ap.add_argument("root", type=Path,
+                    help="directory whose immediate subdirs are episodes")
+    ap.add_argument("--bg-volume", type=float, default=0.0)
+    ap.add_argument("--tolerance", type=float, default=0.5)
+    ap.add_argument("--trim-direction", choices=["tail", "head", "center"], default="tail")
+    ap.add_argument("--on-short", choices=["error", "pad"], default="error")
+    ap.add_argument("--style", default="auto")
+    ap.add_argument("--fontsdir", type=Path, default=None)
+    ap.add_argument("--no-cache", action="store_true")
+    ap.add_argument("--no-overwrite", action="store_true")
+    ap.add_argument("--keep-intermediates", action="store_true")
+    ap.add_argument("--continue-on-error", action="store_true",
+                    help="skip episodes that fail instead of aborting")
+    ap.add_argument(
+        "--mode", choices=["full", "extract"], default="full",
+        help="'full' (default) runs the complete pipeline per episode. "
+             "'extract' stops after segment extraction and saves clips "
+             "under each ep's edit/ dir; gap clips, voice mixing, "
+             "subtitle burn, and QC report are skipped.",
+    )
+    args = ap.parse_args()
+
+    # Fail early if the tools are missing; the ffmpeg version string is
+    # also handed to every job.
+    versions = preflight()
+    print(f"== ffmpeg {versions['ffmpeg']} / ffprobe {versions['ffprobe']} ==")
+
+    summary = run_episodes(
+        args.root,
+        ffmpeg_version=versions["ffmpeg"],
+        bg_volume=args.bg_volume,
+        tolerance=args.tolerance,
+        trim_direction=args.trim_direction,
+        on_short=args.on_short,
+        style=args.style,
+        fontsdir=args.fontsdir.resolve() if args.fontsdir else None,
+        no_cache=args.no_cache,
+        no_overwrite=args.no_overwrite,
+        keep_intermediates=args.keep_intermediates,
+        continue_on_error=args.continue_on_error,
+        mode=args.mode,
+    )
+    # Exit nonzero if any episode failed (even with --continue-on-error,
+    # the caller probably wants to know).
+    if summary["ok"] < summary["episodes_total"]:
+        raise SystemExit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/helpers/srt_driven_edit.py b/helpers/srt_driven_edit.py
new file mode 100644
index 0000000..7a65c86
--- /dev/null
+++ b/helpers/srt_driven_edit.py
@@ -0,0 +1,1704 @@
+"""SRT-driven edit: assemble a final cut by aligning source ranges to an SRT.
+
+Independent pipeline. Does NOT touch the main render.py flow. Use when you
+have a finished script (script.srt = final captions timeline) and a list of
+source ranges keyed by SRT id.
+
+Pipeline:
+  parse SRT + plan ─> strict validate ─> align ─> resolve style
+  ─> extract segments (with cache) ─> insert gap clips ─> concat
+  ─> audio replace/mix + subtitle burn LAST (Hard Rule 1) ─> QC report
+
+Schemas (both forms accepted):
+
+  Form A — array, single source (legacy):
+    [{"id": 1, "source_start": "HH:MM:SS,ms", "source_end": "HH:MM:SS,ms"}, ...]
+    + CLI --source <path>
+
+  Form B — object, multi-source / multi-voice:
+    {
+      "sources": {"A": "path/a.mp4", "B": "path/b.mp4"},
+      "voices":  {"main": "path/v.wav"},
+      "segments": [
+        {"id": 1, "source": "A", "source_start": "...", "source_end": "...",
+         "voice": "main"},
+        {"id": 2, "source": "B", "source_start": "...", "source_end": "..."}
+      ]
+    }
+
+Batch:
+    --batch jobs.json    (array of per-job dicts, same fields as CLI flags)
+    --batch jobs.csv     (header row of the same fields)
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import hashlib
+import json
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Any
+
+# Optional integration with render.py. If it cannot be imported for any
+# reason, self-contained fallbacks are defined instead: a default subtitle
+# style, an empty tonemap chain, and detection stubs that always report
+# "not HDR" / "not portrait".
+try:
+    from render import (
+        SUB_FORCE_STYLE as _RENDER_SUB_STYLE,
+        TONEMAP_CHAIN,
+        is_hdr_source,
+        is_portrait_source,
+    )
+except Exception:
+    _RENDER_SUB_STYLE = (
+        "FontName=Helvetica,FontSize=18,Bold=1,"
+        "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000,"
+        "BorderStyle=1,Outline=2,Shadow=0,"
+        "Alignment=2,MarginV=90"
+    )
+    TONEMAP_CHAIN = ""
+
+    def is_hdr_source(video: Path) -> bool:  # type: ignore
+        """Fallback stub: always reports the source as non-HDR."""
+        return False
+
+    def is_portrait_source(video: Path) -> bool:  # type: ignore
+        """Fallback stub: always reports the source as non-portrait."""
+        return False
+
+# ============================================================================
+# Constants
+# ============================================================================
+
+# Output frame rate; baked into V_SYNC_TAIL below.
+FPS = 24
+# Audio sample rate / bitrate — presumably applied when encoding clips
+# (usage is outside this part of the file; confirm).
+SAMPLE_RATE = 48000
+AUDIO_BITRATE = "192k"
+# Name suggests the allowed final-duration drift in seconds — TODO confirm
+# against its use site.
+DURATION_DRIFT_TOLERANCE_S = 0.2
+
+# Named subtitle style strings (ASS style key=value pairs), keyed by
+# template name.
+STYLE_TEMPLATES: dict[str, str] = {
+    "bold-uppercase": _RENDER_SUB_STYLE,
+    "cjk-natural": (
+        "FontName=Microsoft YaHei UI,FontSize=20,Bold=0,"
+        "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000,"
+        "BorderStyle=1,Outline=2,Shadow=0,"
+        "Alignment=2,MarginV=90"
+    ),
+    "narrative": (
+        "FontName=Helvetica,FontSize=20,Bold=0,"
+        "PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BackColour=&H00000000,"
+        "BorderStyle=1,Outline=2,Shadow=0,"
+        "Alignment=2,MarginV=80"
+    ),
+}
+
+# Matches one character from: CJK Unified Ideographs (U+4E00–U+9FFF),
+# Extension A (U+3400–U+4DBF), Hiragana (U+3040–U+309F),
+# Katakana (U+30A0–U+30FF), Hangul syllables block (U+AC00–U+D7AF).
+CJK_RE = re.compile(
+    r"[一-鿿㐀-䶿぀-ゟ゠-ヿ가-힯]"
+)
+
+CACHE_VERSION = 2  # bumped: cache now keyed by ffmpeg version + encoding params
+
+# Encoding-affecting constants captured into a single fingerprint so that
+# any later tweak to codec / preset / sync tails forces a cache miss. If you
+# change PARAMS_FINGERPRINT's inputs, existing cached clips are auto-invalidated.
+def _params_fingerprint() -> str:
+    payload = repr([
+        "fps", 24,
+        "sr", 48000,
+        "ab", "192k",
+        "ac", 2,
+        "v_codec", "libx264", "preset", "fast", "crf", 20, "pix", "yuv420p",
+        "a_codec", "aac",
+    ])
+    return hashlib.sha1(payload.encode("utf-8")).hexdigest()[:10]
+
+
+# Encodings tried in order when reading user-supplied SRT files. Windows
+# Chinese systems frequently save as GBK/GB18030; macOS / *nix typically
+# UTF-8 (with or without BOM). cp1252 is the last-resort Western Latin1.
+# Order matters: the first encoding that decodes without error wins.
+SRT_ENCODINGS = ("utf-8-sig", "utf-8", "gb18030", "cp936", "cp1252")
+
+# Audio/video sync tails appended to every per-segment filter chain so that
+# each extracted clip starts at PTS 0 with monotonic timestamps. Without
+# these, concatenating many short clips accumulates sub-frame drift that
+# eventually desyncs voice from picture.
+V_SYNC_TAIL = f"fps={FPS},setpts=PTS-STARTPTS"
+A_SYNC_TAIL = "aresample=async=1:first_pts=0,asetpts=PTS-STARTPTS"
+
+# Computed once at import time.
+PARAMS_FINGERPRINT = _params_fingerprint()
+
+
+# ============================================================================
+# Path / filter escaping
+# ============================================================================
+
+
+def subs_filter_escape(path: Path) -> str:
+    """Escape a path for use inside ffmpeg's subtitles='...' filter argument.
+
+    Order matters: backslashes first (Windows), then drive-letter colons, then
+    quotes. The path is returned in forward-slash form for libavfilter sanity.
+    """
+    s = path.resolve().as_posix()
+    s = s.replace("\\", "\\\\")
+    s = s.replace(":", r"\:")
+    s = s.replace("'", r"\'")
+    return s
+
+
+def safe_ascii_name(stem: str) -> str:
+    """Reduce a filename stem to a safe ASCII slug for intermediate files."""
+    s = re.sub(r"[^A-Za-z0-9._-]+", "_", stem)
+    s = s.strip("_") or "job"
+    return s[:48]
+
+
+def concat_quote_path(p: Path) -> str:
+    """Quote a path for ffmpeg's concat demuxer 'file' directive.
+
+    Embeds single quotes via the close-escape-reopen idiom: `'` -> `'\\''`.
+    Paths are normalized to posix form so backslashes do not become escape
+    sequences when libavformat parses the list.
+    """
+    s = p.resolve().as_posix()
+    escaped = s.replace("'", "'\\''")
+    return f"'{escaped}'"
+
+
+def read_srt_text(path: Path) -> str:
+    """Read an SRT with encoding fallback.
+
+    Tries SRT_ENCODINGS in order; returns the first successful decode.
+    Raises SystemExit with a helpful message if none work.
+    """
+    raw = path.read_bytes()
+    last_err: Exception | None = None
+    for enc in SRT_ENCODINGS:
+        try:
+            return raw.decode(enc)
+        except UnicodeDecodeError as e:
+            last_err = e
+            continue
+    raise SystemExit(
+        f"could not decode SRT {path} with any of {SRT_ENCODINGS}: {last_err}"
+    )
+
+
+def make_safe_work_dir(job_name: str, plan_path: Path) -> Path:
+    """Create (or reset) a safe ASCII-named temp dir for one job's intermediates.
+
+    Lives under tempfile.gettempdir() so it never inherits CJK / quote /
+    space characters from the user's project path. Deterministic hash means
+    re-runs land in the same dir for debuggability.
+    """
+    h = hashlib.sha1(
+        f"{plan_path.resolve().as_posix()}|{job_name}".encode("utf-8")
+    ).hexdigest()[:12]
+    p = Path(tempfile.gettempdir()) / f"srt_edit_{h}"
+    if p.exists():
+        shutil.rmtree(p, ignore_errors=True)
+    p.mkdir(parents=True)
+    return p
+
+
+def _path_is_filter_safe(p: Path) -> bool:
+    """Cheap libavfilter-path safety check: ASCII only and no single quotes."""
+    s = str(p)
+    return s.isascii() and "'" not in s
+
+
+def ensure_safe_subs_path(src: Path) -> tuple[Path, Path | None]:
+    """Return (path_to_feed_to_ffmpeg, cleanup_target_or_None).
+
+    If src is already filter-safe, return it as-is and no cleanup target.
+    Otherwise copy to a deterministic ASCII path under the system temp dir
+    and return that, plus a handle the caller should unlink in finally.
+
+    Decoded through read_srt_text so GB18030 / cp936 inputs become UTF-8.
+    """
+    if _path_is_filter_safe(src):
+        return src, None
+    h = hashlib.sha1(src.resolve().as_posix().encode("utf-8")).hexdigest()[:12]
+    safe = Path(tempfile.gettempdir()) / f"srt_burn_{h}.srt"
+    safe.write_text(read_srt_text(src), encoding="utf-8")
+    return safe, safe
+
+
+# ============================================================================
+# Preflight: tool availability + media stream probing
+# ============================================================================
+
+
+_FFMPEG_VERSION_RE = re.compile(r"^ffmpeg version (\S+)")
+_FFPROBE_VERSION_RE = re.compile(r"^ffprobe version (\S+)")
+
+
+def preflight() -> dict[str, str]:
+    """Verify ffmpeg + ffprobe are on PATH and runnable. Return version dict.
+
+    Used both for early failure and to fingerprint cache keys: encoding
+    behavior can shift between ffmpeg versions, so a version bump should
+    invalidate cached clips.
+
+    Returns {"ffmpeg": <version>, "ffprobe": <version>}.
+
+    Raises SystemExit if a tool is missing, does not answer `-version`
+    within 10 seconds, or exits non-zero.
+    """
+    info: dict[str, str] = {}
+    for tool, rx in (("ffmpeg", _FFMPEG_VERSION_RE), ("ffprobe", _FFPROBE_VERSION_RE)):
+        try:
+            r = subprocess.run(
+                [tool, "-version"],
+                capture_output=True, text=True, timeout=10,
+                encoding="utf-8", errors="replace",
+            )
+        except FileNotFoundError:
+            raise SystemExit(
+                f"required tool not on PATH: {tool}. Install ffmpeg first "
+                f"(e.g. `winget install Gyan.FFmpeg` on Windows, "
+                f"`brew install ffmpeg` on macOS)."
+            )
+        except subprocess.TimeoutExpired:
+            raise SystemExit(f"{tool} timed out on `-version`. Bad install?")
+        if r.returncode != 0:
+            raise SystemExit(
+                f"{tool} `-version` exited {r.returncode}: {(r.stderr or '')[:300]}"
+            )
+        # Only the first stdout line is inspected for the version token.
+        first_line = (r.stdout.splitlines() or [""])[0].strip()
+        m = rx.match(first_line)
+        # Unrecognised banner: keep up to 40 chars of it, or "unknown" if
+        # the line is empty.
+        info[tool] = m.group(1) if m else first_line[:40] or "unknown"
+    return info
+
+
+def probe_streams(path: Path) -> dict:
+    """Ask ffprobe what a media file contains.
+
+    Returns a dict with ``has_video`` / ``has_audio`` (True when at least
+    one stream of that codec type is reported) and ``duration`` (container
+    duration in seconds; 0.0 when ffprobe reports nothing usable).
+
+    Exits through SystemExit when ffprobe is missing, returns a non-zero
+    status, or emits output that is not valid JSON, so callers never carry
+    on with a half-known source.
+    """
+    probe_cmd = [
+        "ffprobe", "-v", "error",
+        "-show_entries", "stream=codec_type",
+        "-show_entries", "format=duration",
+        "-of", "json", str(path),
+    ]
+    try:
+        r = subprocess.run(
+            probe_cmd,
+            capture_output=True, text=True, check=True,
+            encoding="utf-8", errors="replace",
+        )
+    except FileNotFoundError:
+        raise SystemExit(
+            "ffprobe not on PATH. Install ffmpeg "
+            "(`winget install Gyan.FFmpeg` / `brew install ffmpeg`)."
+        )
+    except subprocess.CalledProcessError as e:
+        raise SystemExit(
+            f"ffprobe failed on {path}: {(e.stderr or '')[:300]}"
+        )
+
+    try:
+        payload = json.loads(r.stdout)
+    except json.JSONDecodeError as e:
+        raise SystemExit(f"ffprobe returned malformed JSON for {path}: {e}")
+
+    # Collect the distinct, non-empty codec types across all streams.
+    stream_list = payload.get("streams", []) or []
+    kinds = {s.get("codec_type") for s in stream_list if s.get("codec_type")}
+
+    # A missing or non-numeric duration (e.g. "N/A") degrades to 0.0.
+    container = payload.get("format") or {}
+    try:
+        seconds = float(container.get("duration", 0.0))
+    except (TypeError, ValueError):
+        seconds = 0.0
+
+    return {
+        "has_video": "video" in kinds,
+        "has_audio": "audio" in kinds,
+        "duration": seconds,
+    }
+
+
+# ============================================================================
+# Time parsing
+# ============================================================================
+
+
+# H:MM:SS followed by ',' or '.' and 1-3 fractional digits.
+_TS_RE = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})")
+
+
+def parse_timestamp(ts: str) -> float:
+    """Convert an ``HH:MM:SS,mmm`` (or ``HH:MM:SS.mmm``) string to seconds.
+
+    Surrounding whitespace is ignored. The fractional part may have 1-3
+    digits and is read as a decimal fraction, so ``"0:00:01.5"`` is 1.5 s.
+
+    Raises:
+        ValueError: if `ts` is not a string or does not match the format.
+            Non-string values (e.g. a bare number in a JSON plan) used to
+            surface as AttributeError from ``.strip()``; they are reported
+            as ValueError so callers need to handle only one error type.
+    """
+    if not isinstance(ts, str):
+        raise ValueError(f"bad timestamp: {ts!r}")
+    m = _TS_RE.fullmatch(ts.strip())
+    if not m:
+        raise ValueError(f"bad timestamp: {ts!r}")
+    h, mn, s, ms = m.groups()
+    # Right-pad the fraction so "5" means 500 ms, not 5 ms.
+    return int(h) * 3600 + int(mn) * 60 + int(s) + int(ms.ljust(3, "0")) / 1000.0
+
+
+def format_srt_ts(seconds: float) -> str:
+    """Render a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.
+
+    The value is rounded to the nearest millisecond before being split
+    into hour / minute / second / millisecond fields.
+    """
+    millis = int(round(seconds * 1000))
+    hours, millis = divmod(millis, 3600_000)
+    minutes, millis = divmod(millis, 60_000)
+    secs, millis = divmod(millis, 1000)
+    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, millis)
+
+
+# ============================================================================
+# Dataclasses
+# ============================================================================
+
+
+@dataclass
+class SrtCue:
+    """One subtitle cue as produced by parse_srt."""
+
+    id: int              # numeric index from the first line of the SRT block
+    final_start: float   # cue start time in seconds (from the time line)
+    final_end: float     # cue end time in seconds (from the time line)
+    text: str            # remaining block lines joined with "\n"
+
+    @property
+    def duration(self) -> float:
+        """Length of the cue in seconds (end minus start)."""
+        return self.final_end - self.final_start
+
+
+@dataclass
+class PlanEntry:
+    """One row of the edit plan: which source range backs a given cue id."""
+
+    id: int                # segment id; validate_alignment matches it to SrtCue.id
+    source_name: str       # key into sources map (Form A: synthetic "_default")
+    source_start: float    # start of the planned source range, in seconds
+    source_end: float      # end of the planned source range, in seconds
+    voice_name: str | None = None  # key into voices map
+
+    @property
+    def duration(self) -> float:
+        """Length of the planned source range in seconds."""
+        return self.source_end - self.source_start
+
+
+@dataclass
+class Segment:
+    """A cue joined with its plan entry, as built by align()."""
+
+    id: int
+    source_path: Path          # media file the segment is cut from
+    source_start: float        # source range start after any trimming, seconds
+    source_end: float          # source range end after any trimming, seconds
+    out_start: float           # cue start on the output timeline, seconds
+    out_end: float             # cue end on the output timeline, seconds
+    leading_gap: float         # seconds between the previous cue's end and this start (>= 0)
+    text: str                  # subtitle text of the cue
+    voice_path: Path | None    # per-segment voice file, if the plan names one
+    pad_short: bool = False    # True when the source range is shorter than the cue
+    plan_src_dur: float = 0.0  # planned source duration before trimming, seconds
+
+    @property
+    def duration(self) -> float:
+        """Length of the segment on the output timeline, in seconds."""
+        return self.out_end - self.out_start
+
+
+# ============================================================================
+# SRT parser + validation
+# ============================================================================
+
+
+def _split_time_line(line: str) -> tuple[str, str]:
+    """Pull the (start, end) timestamp strings out of an SRT time line.
+
+    Anything after the end timestamp (cue settings such as
+    'position:90% align:start') is dropped: only the last
+    whitespace-separated token before '-->' and the first one after it
+    are returned.
+
+    Raises ValueError when '-->' is absent or either side is empty.
+    """
+    head, arrow, tail = line.partition("-->")
+    if not arrow:
+        raise ValueError(f"missing '-->' in time line: {line!r}")
+    before = head.split()
+    after = tail.split()
+    if not (before and after):
+        raise ValueError(f"missing timestamps in time line: {line!r}")
+    return before[-1], after[0]
+
+
+def parse_srt(path: Path) -> list[SrtCue]:
+    """Parse an SRT file into a list of SrtCue, in file order.
+
+    Cue blocks are separated by one or more blank lines. A separator line
+    that holds only whitespace (spaces, tabs, full-width spaces) counts as
+    blank, and CRLF / bare-CR line endings are normalised first. Without
+    this, two cues divided by a "blank" line containing a stray space were
+    merged into one block and the second cue's id and time line ended up
+    inside the first cue's text.
+
+    Blocks with fewer than two non-blank lines are skipped. A block whose
+    first line is not an integer, or whose second line is not a valid
+    time line, aborts with SystemExit.
+    """
+    raw = read_srt_text(path)
+    # Normalise line endings so the block splitter only deals with "\n".
+    text = raw.replace("\r\n", "\n").replace("\r", "\n").strip()
+    # A boundary is a newline followed by one or more lines that contain
+    # nothing but (non-newline) whitespace.
+    blocks = re.split(r"\n(?:[^\S\n]*\n)+", text)
+    cues: list[SrtCue] = []
+    for block in blocks:
+        lines = [ln.rstrip() for ln in block.splitlines() if ln.strip() != ""]
+        if len(lines) < 2:
+            continue
+        try:
+            idx = int(lines[0].strip())
+        except ValueError:
+            raise SystemExit(f"SRT block missing numeric id: {lines[0]!r}")
+        if "-->" not in lines[1]:
+            raise SystemExit(f"SRT block missing time line: {lines[1]!r}")
+        try:
+            a, b = _split_time_line(lines[1])
+            start = parse_timestamp(a)
+            end = parse_timestamp(b)
+        except ValueError as e:
+            raise SystemExit(f"SRT id={lines[0]}: {e}")
+        cues.append(SrtCue(id=idx, final_start=start, final_end=end,
+                           text="\n".join(lines[2:])))
+    return cues
+
+
+def validate_srt(cues: list[SrtCue]) -> None:
+    """Reject cue lists the pipeline cannot lay out on a timeline.
+
+    Per cue: ids must be unique, the end must come after the start, and
+    the start must not be negative. Across cues, taken in id order: start
+    times must not go backwards and a cue must not begin before the
+    previous one has ended (1 microsecond of slack). The first violation
+    found aborts with SystemExit.
+    """
+    if not cues:
+        raise SystemExit("SRT has no cues")
+
+    ids_seen: set[int] = set()
+    for cue in cues:
+        if cue.id in ids_seen:
+            raise SystemExit(f"SRT duplicate id: {cue.id}")
+        ids_seen.add(cue.id)
+        if cue.final_end <= cue.final_start:
+            raise SystemExit(
+                f"SRT id={cue.id}: end {cue.final_end:.3f} <= start {cue.final_start:.3f}"
+            )
+        if cue.final_start < 0:
+            raise SystemExit(f"SRT id={cue.id}: negative start {cue.final_start:.3f}")
+
+    # Walk neighbouring cues in id order.
+    ordered = sorted(cues, key=lambda cue: cue.id)
+    for prev, cur in zip(ordered, ordered[1:]):
+        if cur.final_start < prev.final_start:
+            raise SystemExit(
+                f"SRT non-monotonic by id: id={cur.id} starts at "
+                f"{cur.final_start:.3f}s, earlier than id={prev.id} at "
+                f"{prev.final_start:.3f}s"
+            )
+        if cur.final_start < prev.final_end - 1e-6:
+            raise SystemExit(
+                f"SRT cue overlap: id={prev.id} ends {prev.final_end:.3f}, "
+                f"id={cur.id} starts {cur.final_start:.3f}"
+            )
+
+
+# ============================================================================
+# Plan parser + validation
+# ============================================================================
+
+
+def _plan_path_map(raw: object, field: str, base: Path) -> dict[str, Path]:
+    """Resolve a plan's name -> path object; relative paths anchor at `base`."""
+    if not raw:
+        return {}
+    if not isinstance(raw, dict):
+        raise SystemExit(
+            f"edit_plan '{field}' must be a JSON object of name -> path, "
+            f"got {type(raw).__name__}"
+        )
+    resolved: dict[str, Path] = {}
+    for name, p in raw.items():
+        if not isinstance(p, str):
+            raise SystemExit(
+                f"edit_plan {field}[{name!r}]: path must be a string, got {p!r}"
+            )
+        fp = Path(p)
+        if not fp.is_absolute():
+            fp = (base / fp).resolve()
+        resolved[name] = fp
+    return resolved
+
+
+def _plan_entry(row: object, index: int, *, form_b: bool) -> PlanEntry:
+    """Build one PlanEntry from a plan row, exiting with row context on bad data."""
+    where = f"edit_plan segments[{index}]"
+    if not isinstance(row, dict):
+        raise SystemExit(
+            f"{where}: expected a JSON object, got {type(row).__name__}"
+        )
+    required = ["id", "source_start", "source_end"]
+    if form_b:
+        required.insert(1, "source")
+    missing = [k for k in required if k not in row]
+    if missing:
+        raise SystemExit(f"{where}: missing field(s): {', '.join(missing)}")
+    try:
+        seg_id = int(row["id"])
+    except (TypeError, ValueError):
+        raise SystemExit(f"{where}: id must be an integer, got {row['id']!r}")
+
+    bounds: list[float] = []
+    for key in ("source_start", "source_end"):
+        value = row[key]
+        # Checked here so a bare JSON number gives a readable message
+        # rather than an attribute error from string handling.
+        if not isinstance(value, str):
+            raise SystemExit(
+                f"{where} (id={seg_id}): {key} must be a timestamp string "
+                f"like \"00:01:02,500\", got {value!r}"
+            )
+        try:
+            bounds.append(parse_timestamp(value))
+        except ValueError as e:
+            raise SystemExit(f"{where} (id={seg_id}): {key}: {e}")
+
+    voice = row.get("voice") if form_b else None
+    if voice is not None and not isinstance(voice, str):
+        raise SystemExit(
+            f"{where} (id={seg_id}): voice must be a string or null, got {voice!r}"
+        )
+    return PlanEntry(
+        id=seg_id,
+        source_name=str(row["source"]) if form_b else "_default",
+        source_start=bounds[0],
+        source_end=bounds[1],
+        voice_name=voice,
+    )
+
+
+def parse_plan(path: Path) -> tuple[dict[str, Path], dict[str, Path], list[PlanEntry]]:
+    """Load an edit plan and return (sources_map, voices_map, entries).
+
+    Two layouts are accepted:
+      Form A - a JSON array of rows with id / source_start / source_end.
+               Every entry gets the synthetic source name "_default" and
+               both maps come back empty.
+      Form B - a JSON object with a "segments" array plus optional
+               "sources" and "voices" objects mapping names to paths.
+               Relative paths are resolved against the plan's directory.
+
+    Any structural problem (unreadable file, invalid JSON, wrong container
+    types, missing keys, non-string or malformed timestamps) ends in
+    SystemExit with the offending row identified, never a raw
+    KeyError/TypeError traceback.
+    """
+    try:
+        raw = path.read_text(encoding="utf-8")
+    except OSError as e:
+        raise SystemExit(f"edit_plan unreadable: {path}: {e}")
+    try:
+        data = json.loads(raw)
+    except json.JSONDecodeError as e:
+        raise SystemExit(
+            f"edit_plan is not valid JSON: {path}: "
+            f"line {e.lineno} col {e.colno}: {e.msg}"
+        )
+    base = path.parent
+
+    if isinstance(data, list):
+        entries = [_plan_entry(row, i, form_b=False) for i, row in enumerate(data)]
+        return {}, {}, entries
+
+    if not isinstance(data, dict):
+        raise SystemExit("edit_plan must be a JSON array or object")
+    if "segments" not in data:
+        raise SystemExit("Form B plan missing 'segments' field")
+    if not isinstance(data["segments"], list):
+        raise SystemExit(
+            "Form B plan 'segments' must be a JSON array, got "
+            f"{type(data['segments']).__name__}"
+        )
+
+    sources_map = _plan_path_map(data.get("sources"), "sources", base)
+    voices_map = _plan_path_map(data.get("voices"), "voices", base)
+    entries = [
+        _plan_entry(row, i, form_b=True)
+        for i, row in enumerate(data["segments"])
+    ]
+    return sources_map, voices_map, entries
+
+
+def validate_plan(
+    entries: list[PlanEntry],
+    sources_map: dict[str, Path],
+    voices_map: dict[str, Path],
+    legacy_default_source: Path | None,
+) -> None:
+    """Check plan entries against the declared sources and voices.
+
+    Every entry needs a unique id, a non-negative start, an end after its
+    start, a resolvable source ("_default" requires
+    `legacy_default_source`; any other name must be a key of
+    `sources_map`) and, when it names a voice, a key of `voices_map`.
+    After that, every mapped source and voice path and the legacy source
+    must exist on disk. The first failure aborts with SystemExit.
+    """
+    if not entries:
+        raise SystemExit("edit_plan has no segments")
+
+    known_ids: set[int] = set()
+    for entry in entries:
+        if entry.id in known_ids:
+            raise SystemExit(f"plan duplicate id: {entry.id}")
+        known_ids.add(entry.id)
+
+        if entry.source_start < 0:
+            raise SystemExit(f"plan id={entry.id}: negative source_start {entry.source_start}")
+        if entry.source_end <= entry.source_start:
+            raise SystemExit(
+                f"plan id={entry.id}: source_end {entry.source_end:.3f} <= "
+                f"source_start {entry.source_start:.3f}"
+            )
+
+        uses_default = entry.source_name == "_default"
+        if uses_default and legacy_default_source is None:
+            raise SystemExit(
+                "Form A plan requires --source <path> at the CLI"
+            )
+        if not uses_default and entry.source_name not in sources_map:
+            raise SystemExit(
+                f"plan id={entry.id}: source '{entry.source_name}' not in sources map"
+            )
+
+        voice = entry.voice_name
+        if voice is not None and voice not in voices_map:
+            raise SystemExit(
+                f"plan id={entry.id}: voice '{voice}' not in voices map"
+            )
+
+    # Existence checks run only after all entries are structurally valid.
+    for name, source_file in sources_map.items():
+        if not source_file.exists():
+            raise SystemExit(f"source '{name}' missing on disk: {source_file}")
+    for name, voice_file in voices_map.items():
+        if not voice_file.exists():
+            raise SystemExit(f"voice '{name}' missing on disk: {voice_file}")
+    if legacy_default_source is not None and not legacy_default_source.exists():
+        raise SystemExit(f"--source missing on disk: {legacy_default_source}")
+
+
+def validate_alignment(cues: list[SrtCue], entries: list[PlanEntry]) -> None:
+    """Require the SRT and the plan to cover exactly the same set of ids.
+
+    On a mismatch, exits with SystemExit listing the ids present on one
+    side only (sorted), SRT-only ids first.
+    """
+    srt_ids = {c.id for c in cues}
+    planned_ids = {e.id for e in entries}
+    if srt_ids == planned_ids:
+        return
+
+    problems = []
+    srt_only = srt_ids - planned_ids
+    plan_only = planned_ids - srt_ids
+    if srt_only:
+        problems.append(f"in SRT but not in plan: {sorted(srt_only)}")
+    if plan_only:
+        problems.append(f"in plan but not in SRT: {sorted(plan_only)}")
+    raise SystemExit("id mismatch: " + "; ".join(problems))
+
+
+# ============================================================================
+# Alignment
+# ============================================================================
+
+
+def align(
+    cues: list[SrtCue],
+    entries: list[PlanEntry],
+    sources_map: dict[str, Path],
+    voices_map: dict[str, Path],
+    legacy_default_source: Path | None,
+    tolerance: float,
+    trim_direction: str,
+    on_short: str,
+) -> list[Segment]:
+    """Join each cue with its plan entry and fit the source range to the cue.
+
+    Cues are processed in id order. For each one the planned source
+    duration is compared with the cue duration:
+
+      * shorter by more than `tolerance`: SystemExit when `on_short` is
+        "error"; otherwise the planned range is kept and the segment is
+        flagged `pad_short`.
+      * longer by more than `tolerance`: the range is cut down to the cue
+        duration from the "tail", the "head", or evenly from both ends
+        ("center"), per `trim_direction`.
+      * within tolerance: the range starts at the planned start and runs
+        for exactly the cue duration.
+
+    `leading_gap` is the non-negative distance between this cue's start
+    and the previous cue's end (0.0 is the reference for the first cue).
+
+    Raises:
+        SystemExit: source too short with on_short="error", or a
+            "_default" entry without `legacy_default_source`.
+        ValueError: unknown `trim_direction` when trimming is needed.
+        KeyError: an id, source or voice name that is not in the
+            corresponding map.
+    """
+    cue_by_id = {c.id: c for c in cues}
+    plan_by_id = {e.id: e for e in entries}
+
+    segments: list[Segment] = []
+    prev_out_end = 0.0
+    for cid in sorted(cue_by_id):
+        cue = cue_by_id[cid]
+        pln = plan_by_id[cid]
+        src_dur = pln.duration
+        target = cue.duration
+
+        pad_short = False
+        if src_dur + tolerance < target:
+            short_by = target - src_dur
+            if on_short == "error":
+                raise SystemExit(
+                    f"id={cid}: source is {short_by:.3f}s shorter than SRT target "
+                    f"({src_dur:.3f}s vs {target:.3f}s). Pass --on-short=pad to "
+                    f"freeze-pad the tail, or extend the source range."
+                )
+            pad_short = True
+            src_start = pln.source_start
+            src_end = pln.source_end
+        elif src_dur > target + tolerance:
+            if trim_direction == "tail":
+                src_start = pln.source_start
+                src_end = pln.source_start + target
+            elif trim_direction == "head":
+                src_start = pln.source_end - target
+                src_end = pln.source_end
+            elif trim_direction == "center":
+                overhang = (src_dur - target) / 2
+                src_start = pln.source_start + overhang
+                src_end = pln.source_end - overhang
+            else:
+                raise ValueError(f"unknown trim_direction: {trim_direction}")
+        else:
+            src_start = pln.source_start
+            src_end = pln.source_start + target
+
+        if pln.source_name == "_default":
+            # Explicit check instead of `assert`: asserts vanish under
+            # `python -O`, and an AssertionError would slip past handlers
+            # that catch SystemExit.
+            if legacy_default_source is None:
+                raise SystemExit(
+                    f"id={cid}: Form A plan requires --source <path> at the CLI"
+                )
+            source_path = legacy_default_source
+        else:
+            source_path = sources_map[pln.source_name]
+
+        voice_path = voices_map[pln.voice_name] if pln.voice_name else None
+        gap = max(0.0, cue.final_start - prev_out_end)
+        segments.append(Segment(
+            id=cid,
+            source_path=source_path,
+            source_start=src_start,
+            source_end=src_end,
+            out_start=cue.final_start,
+            out_end=cue.final_end,
+            leading_gap=gap,
+            text=cue.text,
+            voice_path=voice_path,
+            pad_short=pad_short,
+            plan_src_dur=src_dur,
+        ))
+        prev_out_end = cue.final_end
+
+    return segments
+
+
+# ============================================================================
+# Style resolution
+# ============================================================================
+
+
+def has_cjk(cues: list[SrtCue]) -> bool:
+    """Return True when at least one cue's text matches CJK_RE."""
+    for cue in cues:
+        if CJK_RE.search(cue.text):
+            return True
+    return False
+
+
+def resolve_style(style_arg: str, cues: list[SrtCue]) -> str:
+    """Turn the style argument into a force_style string.
+
+    "auto" selects the "cjk-natural" template when any cue has CJK text
+    and "bold-uppercase" otherwise. A known template name returns that
+    template. Any other value containing '=' is taken as a raw style
+    string and returned unchanged. Everything else exits with SystemExit.
+    """
+    if style_arg == "auto":
+        template = "cjk-natural" if has_cjk(cues) else "bold-uppercase"
+        return STYLE_TEMPLATES[template]
+
+    if style_arg in STYLE_TEMPLATES:
+        return STYLE_TEMPLATES[style_arg]
+
+    if "=" not in style_arg:
+        raise SystemExit(
+            f"unknown style: {style_arg!r}. Known templates: "
+            f"{sorted(STYLE_TEMPLATES)}. Pass a raw ASS string with '=' to override."
+        )
+    return style_arg
+
+
+# ============================================================================
+# Clip cache
+# ============================================================================
+
+
+def _file_fingerprint(path: Path) -> tuple[int, int]:
+    """Return (mtime in nanoseconds, size in bytes) from a single stat call."""
+    info = path.stat()
+    mtime_ns = int(info.st_mtime_ns)
+    return mtime_ns, info.st_size
+
+
+def cache_key(seg: Segment, effective_bg_volume: float, hdr: bool,
+              portrait: bool, voice_signature: tuple | None,
+              ffmpeg_version: str) -> str:
+    """Derive the 32-hex-character cache key for a rendered segment clip.
+
+    The key is a truncated SHA-256 over a JSON list of everything fed in
+    here: cache version, resolved source path with its mtime and size,
+    source range, output duration, background volume, HDR / portrait
+    flags, padding state, the render-parameter fingerprint, the ffmpeg
+    version and the voice signature. Floats are rounded to 4 decimals.
+    """
+    mtime_ns, size = _file_fingerprint(seg.source_path)
+    # Element order is part of the key; do not reorder.
+    fields = [
+        CACHE_VERSION,
+        str(seg.source_path.resolve()), mtime_ns, size,
+        round(seg.source_start, 4), round(seg.source_end, 4),
+        round(seg.duration, 4),
+        round(effective_bg_volume, 4),
+        hdr, portrait,
+        seg.pad_short, round(seg.plan_src_dur, 4),
+        PARAMS_FINGERPRINT,
+        ffmpeg_version,
+        voice_signature,
+    ]
+    serialized = json.dumps(fields, sort_keys=True)
+    digest = hashlib.sha256(serialized.encode()).hexdigest()
+    return digest[:32]
+
+
+def cache_lookup(cache_dir: Path, key: str) -> Path | None:
+    """Return the cached clip path ``<cache_dir>/<key>.mp4`` if it exists, else None."""
+    candidate = cache_dir / f"{key}.mp4"
+    if candidate.exists():
+        return candidate
+    return None
+
+
+def cache_store(cache_dir: Path, key: str, clip_path: Path) -> None:
+    """Copy a rendered clip into the cache as ``<key>.mp4``.
+
+    The copy goes to a ``.part`` file in the same directory and is then
+    renamed over the final name. cache_lookup only checks that
+    ``<key>.mp4`` exists, so copying straight to that name meant an
+    interrupted copy (Ctrl-C, full disk) left a truncated clip that later
+    runs treated as a valid cache hit.
+
+    Raises OSError if the clip cannot be copied; nothing is left behind
+    in that case.
+    """
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    final_path = cache_dir / f"{key}.mp4"
+    partial_path = cache_dir / f"{key}.mp4.part"
+    try:
+        shutil.copy2(clip_path, partial_path)
+        # Same-directory rename: the final name only ever refers to a
+        # completely written file.
+        partial_path.replace(final_path)
+    finally:
+        # No-op after a successful rename; removes the leftover otherwise.
+        partial_path.unlink(missing_ok=True)
+
+
+# ============================================================================
+# ffmpeg orchestration
+# ============================================================================
+
+
+class PipelineError(SystemExit):
+    """SystemExit that also carries a slice of ffmpeg's stderr.
+
+    Because it subclasses SystemExit, one `except SystemExit` handler
+    covers both this and the plain SystemExit raised by validation code.
+    Handlers read the extra context with `getattr(e, 'stderr_tail', '')`,
+    which yields an empty string for the plain variety.
+    """
+    def __init__(self, message: str, *, stderr_tail: str = ""):
+        self.stderr_tail = stderr_tail
+        super().__init__(message)
+
+
+def _tail_text(s: str, *, max_lines: int = 30, max_chars: int = 2000) -> str:
+    """Keep only the end of `s`: at most `max_lines` lines and `max_chars` characters.
+
+    An empty input gives an empty string. When the selected lines are
+    still longer than `max_chars`, the text is cut from the front and a
+    "...[truncated]..." marker line is put ahead of what remains.
+    """
+    if not s:
+        return ""
+    kept_lines = s.strip().splitlines()[-max_lines:]
+    tail = "\n".join(kept_lines)
+    if len(tail) <= max_chars:
+        return tail
+    # 22 characters are reserved for the marker line.
+    return "...[truncated]...\n" + tail[-(max_chars - 22):]
+
+
+def run_ff(cmd: list[str], desc: str) -> None:
+    """Run one ffmpeg command, printing `desc` first.
+
+    Output is captured and decoded as UTF-8 with replacement of bad
+    bytes. On a non-zero exit the full stderr is written to this
+    process's stderr, and a PipelineError is raised with a bounded tail
+    of that stderr attached, so a summary can show the cause without
+    storing the whole log.
+    """
+    print(f"  $ {desc}")
+    proc = subprocess.run(cmd, capture_output=True, text=True,
+                          encoding="utf-8", errors="replace")
+    if proc.returncode == 0:
+        return
+    captured = proc.stderr or ""
+    sys.stderr.write(captured)
+    raise PipelineError(
+        f"ffmpeg failed: {desc}",
+        stderr_tail=_tail_text(captured),
+    )
+
+
+def probe_duration(path: Path) -> float:
+    """Return the container duration of `path` in seconds, as reported by ffprobe.
+
+    Output is decoded as UTF-8 with undecodable bytes replaced, the same
+    way the other subprocess calls in this module do it. With the locale
+    default codec, ffprobe text that is not valid in that codec (for
+    example UTF-8 CJK path names in a message on a Windows code page)
+    could raise UnicodeDecodeError before the return code was examined.
+
+    Raises:
+        subprocess.CalledProcessError: ffprobe exited non-zero.
+        ValueError: ffprobe printed something that is not a number.
+    """
+    out = subprocess.run(
+        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+         "-of", "default=noprint_wrappers=1:nokey=1", str(path)],
+        capture_output=True, text=True, check=True,
+        encoding="utf-8", errors="replace",
+    )
+    return float(out.stdout.strip())
+
+
+def scale_filter_for(source: Path) -> str:
+    """Pick the ffmpeg scale filter: height 1920 for portrait sources, width 1920 otherwise."""
+    if is_portrait_source(source):
+        return "scale=-2:1920"
+    return "scale=1920:-2"
+
+
+def _voice_signature(voice_path: Path | None, target: float) -> tuple | None:
+    """Describe a voice file for cache keying, or return None when there is none.
+
+    The tuple holds the resolved path, the file's fingerprint (mtime in
+    nanoseconds and size) and `target` rounded to 4 decimals.
+    """
+    if voice_path is None:
+        return None
+    mtime_ns, size = _file_fingerprint(voice_path)
+    resolved = str(voice_path.resolve())
+    return (resolved, mtime_ns, size, round(target, 4))
+
+
+def extract_segment(
+    seg: Segment,
+    out_path: Path,
+    bg_volume: float,
+) -> None:
+    """Render one segment to `out_path` with its audio resolved per segment.
+
+    Video: optional HDR tonemap, scaling via scale_filter_for, optional
+    freeze-frame padding, then V_SYNC_TAIL; encoded with libx264.
+
+    `bg_volume` here is the EFFECTIVE level - callers must already have
+    zeroed it for sources whose ffprobe says there is no audio track.
+
+    Audio resolution:
+      voice_path present + bg_volume > 0  -> mix voice + source*bg
+      voice_path present + bg_volume == 0 -> voice only
+      voice_path absent  + bg_volume > 0  -> source audio at bg_volume (fades)
+      voice_path absent  + bg_volume == 0 -> silent
+
+    Padding (seg.pad_short with a planned range shorter than the output
+    duration): the source is read for the planned duration only, and the
+    last frame is cloned by `tpad` for the remainder. The read limit has
+    to be an INPUT option of the source, i.e. appear before its `-i`.
+    It used to follow `-i`, where ffmpeg applies it to the next input
+    (the voice file, cutting it short) or to the output (where the later
+    `-t target` overrides it), so the source kept playing past the
+    planned end instead of freezing.
+
+    NOTE(review): clips cached from padded segments before this fix share
+    the same cache key; consider bumping CACHE_VERSION.
+
+    Raises PipelineError (via run_ff) when ffmpeg fails.
+    """
+    keep_audio_from_source = bg_volume > 0.0
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    target = seg.duration
+    needs_pad = seg.pad_short and seg.plan_src_dur + 1e-6 < target
+
+    vf_parts: list[str] = []
+    if is_hdr_source(seg.source_path):
+        vf_parts.append(TONEMAP_CHAIN)
+    vf_parts.append(scale_filter_for(seg.source_path))
+    if needs_pad:
+        vf_parts.append(
+            f"tpad=stop_mode=clone:stop_duration={target - seg.plan_src_dur:.3f}"
+        )
+    vf_parts.append(V_SYNC_TAIL)
+    vf = ",".join(vf_parts)
+
+    inputs: list[str] = ["-ss", f"{seg.source_start:.3f}"]
+    if needs_pad:
+        # Input-side limit so the source stream ends at the planned range
+        # and tpad/apad have something to pad. Without padding the output
+        # `-t target` below is the only limit, as before.
+        inputs += ["-t", f"{seg.plan_src_dur:.3f}"]
+    inputs += ["-i", str(seg.source_path)]
+
+    has_voice = seg.voice_path is not None
+    voice_index: int | None = None
+    if has_voice:
+        voice_index = 1
+        inputs += ["-i", str(seg.voice_path)]
+
+    # Audio filter graph - applied via -filter_complex when we have voice,
+    # otherwise simple -af on source audio.
+    audio_args: list[str] = []
+    if has_voice and bg_volume <= 0.0:
+        fade_out = max(0.0, target - 0.03)
+        ac_parts = [
+            f"[{voice_index}:a]apad=whole_dur={target:.3f},"
+            f"atrim=duration={target:.3f},asetpts=PTS-STARTPTS,"
+            f"afade=t=in:st=0:d=0.03,"
+            f"afade=t=out:st={fade_out:.3f}:d=0.03,"
+            f"{A_SYNC_TAIL}[outa]"
+        ]
+        audio_args = ["-filter_complex", ";".join(ac_parts),
+                      "-map", "[outa]"]
+    elif has_voice and bg_volume > 0.0:
+        fade_out = max(0.0, target - 0.03)
+        ac_parts = [
+            f"[{voice_index}:a]apad=whole_dur={target:.3f},"
+            f"atrim=duration={target:.3f},asetpts=PTS-STARTPTS[voice]",
+            f"[0:a]volume={bg_volume:.3f},"
+            f"afade=t=in:st=0:d=0.03,afade=t=out:st={fade_out:.3f}:d=0.03[bg]",
+            # duration=first: the padded/trimmed voice sets the mix length.
+            f"[voice][bg]amix=inputs=2:duration=first:normalize=0,"
+            f"{A_SYNC_TAIL}[outa]",
+        ]
+        audio_args = ["-filter_complex", ";".join(ac_parts),
+                      "-map", "[outa]"]
+    elif not has_voice and keep_audio_from_source:
+        fade_out = max(0.0, target - 0.03)
+        af = (
+            f"volume={bg_volume:.3f},"
+            f"afade=t=in:st=0:d=0.03,afade=t=out:st={fade_out:.3f}:d=0.03,"
+            f"{A_SYNC_TAIL}"
+        )
+        if needs_pad:
+            # Source audio stops with the planned range; extend with silence.
+            af = f"apad=whole_dur={target:.3f},{af}"
+        audio_args = ["-af", af, "-map", "0:a"]
+    else:
+        # silent track via lavfi so concat inputs share an audio stream
+        inputs += [
+            "-f", "lavfi", "-t", f"{target:.3f}",
+            "-i", f"anullsrc=channel_layout=stereo:sample_rate={SAMPLE_RATE}",
+        ]
+        # This branch is only reached without a voice input, so the lavfi
+        # source is always input #1.
+        audio_args = ["-af", A_SYNC_TAIL, "-map", "1:a"]
+
+    cmd: list[str] = [
+        "ffmpeg", "-y", "-hide_banner", "-nostats",
+        *inputs,
+        "-vf", vf, "-r", str(FPS),
+        "-map", "0:v",
+        *audio_args,
+        "-c:v", "libx264", "-preset", "fast", "-crf", "20",
+        "-pix_fmt", "yuv420p",
+        "-c:a", "aac", "-b:a", AUDIO_BITRATE, "-ar", str(SAMPLE_RATE), "-ac", "2",
+        "-t", f"{target:.3f}",
+        "-movflags", "+faststart",
+        str(out_path),
+    ]
+    run_ff(cmd, f"extract id={seg.id}  src[{seg.source_start:.2f}-{seg.source_end:.2f}] → {out_path.name}")
+
+
+def make_gap_clip(duration: float, portrait: bool, out_path: Path) -> None:
+    """Render `duration` seconds of black video with a silent stereo track.
+
+    The frame is 1080x1920 for portrait output and 1920x1080 otherwise.
+    Both streams come from lavfi generators and pass through the same
+    V_SYNC_TAIL / A_SYNC_TAIL filters as extracted segments.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    if portrait:
+        size = "1080x1920"
+    else:
+        size = "1920x1080"
+    black_src = f"color=c=black:s={size}:r={FPS}:d={duration:.3f}"
+    silence_src = f"anullsrc=channel_layout=stereo:sample_rate={SAMPLE_RATE}"
+
+    cmd = ["ffmpeg", "-y", "-hide_banner", "-nostats"]
+    cmd += ["-f", "lavfi", "-i", black_src]
+    cmd += ["-f", "lavfi", "-i", silence_src]
+    # anullsrc never ends on its own; the output duration is bounded here.
+    cmd += ["-t", f"{duration:.3f}"]
+    cmd += ["-vf", V_SYNC_TAIL, "-af", A_SYNC_TAIL]
+    cmd += ["-c:v", "libx264", "-preset", "fast", "-crf", "20"]
+    cmd += ["-pix_fmt", "yuv420p", "-r", str(FPS)]
+    cmd += ["-c:a", "aac", "-b:a", AUDIO_BITRATE, "-ar", str(SAMPLE_RATE)]
+    cmd += ["-movflags", "+faststart", str(out_path)]
+    run_ff(cmd, f"gap {duration:.3f}s → {out_path.name}")
+
+
+def concat_clips(clip_paths: list[Path], out_path: Path, work_dir: Path) -> None:
+    """Join clips with ffmpeg's concat demuxer, stream-copying (no re-encode).
+
+    A list file `_concat_srt_driven.txt` is written into `work_dir`
+    (assumed to be a safe-ASCII path) with one `file <quoted-path>` line
+    per clip, quoted by concat_quote_path. The list file is not removed
+    here; callers should register it for cleanup before calling, so a
+    failure part-way through still gets cleaned up.
+    """
+    list_file = work_dir / "_concat_srt_driven.txt"
+    listing = "".join(f"file {concat_quote_path(clip)}\n" for clip in clip_paths)
+    list_file.write_text(listing, encoding="utf-8")
+
+    cmd = ["ffmpeg", "-y", "-hide_banner", "-nostats"]
+    cmd += ["-f", "concat", "-safe", "0", "-i", str(list_file)]
+    cmd += ["-c", "copy", "-movflags", "+faststart", str(out_path)]
+    run_ff(cmd, f"concat {len(clip_paths)} clips → {out_path.name}")
+
+
+def burn_subtitles(
+    base_path: Path,
+    subs_path: Path,
+    style: str,
+    fontsdir: Path | None,
+    out_path: Path,
+    *,
+    global_voice: Path | None = None,
+    total_duration: float = 0.0,
+) -> None:
+    """Last render pass: burn subtitles in, optionally mixing a global voice.
+
+    Paths:
+      - `subs_path` goes through ensure_safe_subs_path; if that hands back
+        a temporary copy, the copy is deleted afterwards whether or not
+        ffmpeg succeeded.
+      - `fontsdir`, when given, must already be filter-safe. A whole font
+        directory is never copied; an unsafe one is a SystemExit.
+
+    Audio:
+      - No `global_voice`: the base audio is stream-copied.
+      - With `global_voice`: the voice is padded/trimmed to exactly
+        `total_duration` (which must be > 0) and mixed over the base
+        audio. The base audio is not scaled again here - per the mix
+        design it already carries the background level from extraction.
+        amix runs with duration=first (the voice) and normalize=0.
+    """
+    if fontsdir is not None and not _path_is_filter_safe(fontsdir):
+        raise SystemExit(
+            f"fontsdir contains non-ASCII or single-quote characters; "
+            f"move it to a safe ASCII path first: {fontsdir}"
+        )
+
+    safe_subs, cleanup_target = ensure_safe_subs_path(subs_path)
+    try:
+        subs_arg = subs_filter_escape(safe_subs)
+        style_escaped = style.replace("'", r"\'")
+        # Filter options are joined with ':'; fontsdir sits between the
+        # file name and force_style when present.
+        filter_opts = [f"subtitles='{subs_arg}'"]
+        if fontsdir is not None:
+            fd = subs_filter_escape(fontsdir)
+            filter_opts.append(f"fontsdir='{fd}'")
+        filter_opts.append(f"force_style='{style_escaped}'")
+        subs_filter = ":".join(filter_opts)
+
+        # Pieces shared by both command variants.
+        head = ["ffmpeg", "-y", "-hide_banner", "-nostats", "-i", str(base_path)]
+        video_enc = ["-c:v", "libx264", "-preset", "fast", "-crf", "18",
+                     "-pix_fmt", "yuv420p"]
+        tail = ["-movflags", "+faststart", str(out_path)]
+
+        if global_voice is not None:
+            if total_duration <= 0.0:
+                raise SystemExit(
+                    "burn_subtitles: total_duration must be > 0 when global_voice is set"
+                )
+            voice_chain = (
+                f"[1:a]apad=whole_dur={total_duration:.3f},"
+                f"atrim=duration={total_duration:.3f},"
+                f"asetpts=PTS-STARTPTS,"
+                f"{A_SYNC_TAIL}"
+            )
+            # [0:a] is mixed in as-is: no volume filter is applied to it.
+            filter_complex = (
+                f"[0:v]{subs_filter}[outv];"
+                f"{voice_chain}[voice];"
+                f"[voice][0:a]amix=inputs=2:duration=first:normalize=0[outa]"
+            )
+            cmd = [
+                *head,
+                "-i", str(global_voice),
+                "-filter_complex", filter_complex,
+                "-map", "[outv]", "-map", "[outa]",
+                *video_enc,
+                "-c:a", "aac", "-b:a", AUDIO_BITRATE, "-ar", str(SAMPLE_RATE),
+                *tail,
+            ]
+            label = f"subtitle burn (LAST) + global voice mix → {out_path.name}"
+        else:
+            # Subtitles only; the audio stream is copied untouched.
+            cmd = [
+                *head,
+                "-vf", subs_filter,
+                *video_enc,
+                "-c:a", "copy",
+                *tail,
+            ]
+            label = f"subtitle burn (LAST) → {out_path.name}"
+
+        run_ff(cmd, label)
+    finally:
+        if cleanup_target is not None:
+            try:
+                cleanup_target.unlink()
+            except OSError:
+                # Best-effort cleanup of the temporary subtitle copy.
+                pass
+
+
+# ============================================================================
+# EDL + QC artifacts
+# ============================================================================
+
+
+def write_edl(segments: list[Segment], srt: Path, plan: Path,
+              bg_volume: float, style_name: str, out_path: Path) -> None:
+    """Write the edit decision list for a job as UTF-8 JSON.
+
+    The document records the resolved SRT and plan paths, the background
+    volume, the style name, one record per segment (times rendered as SRT
+    timestamps, durations rounded to milliseconds) and the total duration,
+    taken as the last segment's `out_end` (0.0 with no segments).
+    Non-ASCII text is written as-is rather than escaped.
+    """
+    seg_records = []
+    for s in segments:
+        voice = str(s.voice_path.resolve()) if s.voice_path else None
+        seg_records.append({
+            "id": s.id,
+            "source": str(s.source_path.resolve()),
+            "source_start": format_srt_ts(s.source_start),
+            "source_end": format_srt_ts(s.source_end),
+            "out_start": format_srt_ts(s.out_start),
+            "out_end": format_srt_ts(s.out_end),
+            "duration": round(s.duration, 3),
+            "leading_gap": round(s.leading_gap, 3),
+            "voice": voice,
+            "pad_short": s.pad_short,
+            "text": s.text,
+        })
+
+    if segments:
+        total = round(segments[-1].out_end, 3)
+    else:
+        total = 0.0
+
+    edl = {
+        "version": "srt-driven-2",
+        "script_srt": str(srt.resolve()),
+        "plan": str(plan.resolve()),
+        "bg_volume": bg_volume,
+        "style": style_name,
+        "segments": seg_records,
+        "total_duration_s": total,
+    }
+    serialized = json.dumps(edl, indent=2, ensure_ascii=False)
+    out_path.write_text(serialized, encoding="utf-8")
+    print(f"  EDL → {out_path.name}")
+
+
+def _dir_size(path: Path) -> int:
+    """Total size in bytes of all regular files under `path`, recursively.
+
+    A path that does not exist counts as 0.
+    """
+    if not path.exists():
+        return 0
+    return sum(
+        entry.stat().st_size
+        for entry in path.rglob("*")
+        if entry.is_file()
+    )
+
+
+def build_qc_report(
+    job_name: str,
+    segments: list[Segment],
+    seg_clip_info: list[dict],
+    output_path: Path,
+    expected_duration: float,
+    style_name: str,
+    style_resolved: str,
+    bg_volume: float,
+    has_any_voice: bool,
+    elapsed_s: float,
+    edit_dir: Path,
+    work_dir: Path,
+    cache_dir: Path,
+    out_qc_path: Path,
+) -> dict:
+    """Measure the finished job, write the QC report as JSON and return it.
+
+    The final output and each segment clip that still exists are probed
+    for their real duration and compared with the expected values. The
+    report's top-level "ok" is True when the output duration is within
+    DURATION_DRIFT_TOLERANCE_S of `expected_duration`. It also records
+    the subtitle style, the audio mode and disk usage of the work
+    directory, clips, final output and cache.
+
+    Each item of `seg_clip_info` is read for "clip_path" and "cached" and
+    is paired with `segments` by position. `edit_dir` is accepted but not
+    used here.
+    """
+    actual_dur = probe_duration(output_path)
+    drift_s = actual_dur - expected_duration
+    drift_ms = round(drift_s * 1000)
+    within_tolerance = abs(drift_s) <= DURATION_DRIFT_TOLERANCE_S
+
+    if has_any_voice and bg_volume <= 0.0:
+        audio_mode = "voice_replace"
+    elif has_any_voice:
+        audio_mode = "voice_mix"
+    elif bg_volume > 0.0:
+        audio_mode = "original_only"
+    else:
+        audio_mode = "silent"
+
+    seg_records = []
+    for seg, info in zip(segments, seg_clip_info):
+        clip = Path(info["clip_path"])
+        # A clip that is no longer on disk is reported with zero
+        # duration and zero size.
+        if clip.exists():
+            actual_seg = probe_duration(info["clip_path"])
+            clip_bytes = clip.stat().st_size
+        else:
+            actual_seg = 0.0
+            clip_bytes = 0
+        seg_records.append({
+            "id": seg.id,
+            "expected_duration_s": round(seg.duration, 3),
+            "actual_duration_s": round(actual_seg, 3),
+            "drift_ms": round((actual_seg - seg.duration) * 1000),
+            "cached": info["cached"],
+            "clip_size_bytes": clip_bytes,
+            "source": str(seg.source_path),
+            "voice": str(seg.voice_path) if seg.voice_path else None,
+        })
+
+    clips_size = sum(rec["clip_size_bytes"] for rec in seg_records)
+    final_size = output_path.stat().st_size
+    cache_size = _dir_size(cache_dir)
+    work_dir_size = _dir_size(work_dir)
+
+    report = {
+        "job": job_name,
+        "ok": within_tolerance,
+        "elapsed_s": round(elapsed_s, 2),
+        "duration": {
+            "expected_s": round(expected_duration, 3),
+            "actual_s": round(actual_dur, 3),
+            "drift_ms": drift_ms,
+            "tolerance_ms": int(DURATION_DRIFT_TOLERANCE_S * 1000),
+            "within_tolerance": within_tolerance,
+        },
+        "segments": seg_records,
+        "subtitles": {
+            "applied": True,
+            "style_name": style_name,
+            "force_style": style_resolved,
+            "cue_count": len(segments),
+        },
+        "audio": {
+            "mode": audio_mode,
+            "bg_volume": bg_volume,
+            "voice_used": has_any_voice,
+        },
+        "disk_usage_bytes": {
+            "work_dir_total": work_dir_size,
+            "clips_in_work_dir": clips_size,
+            "final_output": final_size,
+            "cache": cache_size,
+        },
+        "output_path": str(output_path),
+    }
+    serialized = json.dumps(report, indent=2, ensure_ascii=False)
+    out_qc_path.write_text(serialized, encoding="utf-8")
+    print(f"  QC report → {out_qc_path.name}")
+    return report
+
+
+# ============================================================================
+# Single-job runner
+# ============================================================================
+
+
+@dataclass
+class Job:
+    """All inputs and options for one render: the CLI invocation or one batch row.
+
+    Consumed by `run_job`. Field order matters for positional construction,
+    so any new field belongs at the end and needs a default.
+    """
+
+    source: Path | None         # legacy single-source path; None if Form B
+    srt: Path                   # subtitle script (script.srt)
+    plan: Path                  # edit_plan.json (Form A or Form B)
+    voice: Path | None          # global voice override (mutually exclusive with per-segment)
+    bg_volume: float            # original-audio level: 0.0 = mute, 0.1 = 10%
+    tolerance: float            # seconds of allowed source-vs-SRT duration mismatch
+    trim_direction: str         # the CLI accepts "tail" | "head" | "center"
+    on_short: str               # the CLI accepts "error" | "pad"
+    style: str                  # template name, "auto", or a raw ASS style string
+    fontsdir: Path | None       # extra fonts directory passed to libass
+    output: Path | None         # final video path; None lets run_job derive one
+    name: str                   # job label used in logs and derived file names
+    no_cache: bool              # True disables the segment cache
+    keep_intermediates: bool    # True keeps the temp work dir after the run
+    no_overwrite: bool = False  # True refuses to replace an existing output file
+    mode: str = "full"          # "full" (default) | "extract" (stop after segments)
+
+
+def run_job(job: Job, ffmpeg_version: str) -> dict:
+    """Run one job end to end and return its report dict.
+
+    Steps, in order: parse and validate the SRT and the plan, settle the
+    source / voice configuration, probe every source, bounds-check the plan
+    ranges, align cues to segments, then extract (or restore from cache) one
+    clip per segment into a temp work dir. In "extract" mode the clips are
+    copied to a persistent directory and the function returns. In "full"
+    mode the clips are concatenated, subtitles are burned (the global voice,
+    if any, is handed to that same step) and a QC report is written.
+
+    Args:
+        job: the resolved job description.
+        ffmpeg_version: forwarded to `cache_key` as part of each segment's
+            cache key.
+
+    Returns:
+        "full" mode: the dict produced by `build_qc_report`.
+        "extract" mode: a summary dict with keys `job`, `ok`, `mode`,
+        `extracted_dir`, `clip_count`, `segments`, `elapsed_s`.
+
+    Raises:
+        SystemExit: Form A plan without `--source`; `--voice` combined with
+            per-segment voices; global voice file without an audio track;
+            a source without a video stream; a plan `source_end` beyond the
+            source duration plus tolerance; existing output while
+            `no_overwrite` is set. Helpers called from here may raise their
+            own errors.
+
+    Side effects:
+        Creates the edit dir, writes the EDL JSON there, and creates a temp
+        work dir that is removed in `finally` unless `keep_intermediates`
+        is set.
+    """
+    t0 = time.time()
+    print(f"\n== job: {job.name} ==")
+
+    cues = parse_srt(job.srt)
+    validate_srt(cues)
+    sources_map, voices_map, entries = parse_plan(job.plan)
+
+    legacy_source: Path | None = None
+    if sources_map:
+        if job.source is not None:
+            print("  note: --source ignored (plan defines its own sources)")
+    else:
+        if job.source is None:
+            raise SystemExit("Form A plan needs --source <path>")
+        legacy_source = job.source.resolve()
+
+    has_per_seg_voice = any(e.voice_name for e in entries)
+    if job.voice is not None and has_per_seg_voice:
+        raise SystemExit(
+            "voice conflict: --voice given AND plan contains per-segment voices. "
+            "Pick one."
+        )
+
+    # Global voice is NOT expanded into per-segment entries. Per-segment voices
+    # play during their segment's window; a global voice spans the entire
+    # output timeline and is mixed in during the final compose step. Doing it
+    # at extract time would replay voice[0:seg_dur] for every segment, which
+    # is wrong for any voice longer than one segment.
+    global_voice: Path | None = job.voice
+    if global_voice is not None:
+        v_info = probe_streams(global_voice)
+        if not v_info["has_audio"]:
+            raise SystemExit(f"global --voice file has no audio track: {global_voice}")
+        print(f"  global voice: {global_voice.name} ({v_info['duration']:.3f}s)")
+
+    validate_plan(entries, sources_map, voices_map, legacy_source)
+    validate_alignment(cues, entries)
+
+    # Probe every source once. Cache by Path to avoid repeat ffprobe calls
+    # when many segments share a source.
+    unique_sources: dict[str, Path] = {}
+    if legacy_source is not None:
+        unique_sources["_default"] = legacy_source
+    for name, p in sources_map.items():
+        unique_sources[name] = p
+
+    source_info: dict[str, dict] = {}
+    source_info_by_path: dict[Path, dict] = {}
+    print("  probing sources:")
+    for name, p in unique_sources.items():
+        info = probe_streams(p)
+        source_info[name] = info
+        source_info_by_path[p] = info
+        print(f"    {name}: video={info['has_video']} audio={info['has_audio']} "
+              f"duration={info['duration']:.3f}s")
+        if not info["has_video"]:
+            raise SystemExit(f"source '{name}' has no video stream: {p}")
+
+    # Range bounds — fail fast rather than letting ffmpeg fail mid-batch.
+    # NOTE(review): assumes every entry's source_name is a key of source_info
+    # (including "_default" for Form A) — confirm against parse_plan.
+    for e in entries:
+        info = source_info[e.source_name]
+        if e.source_end > info["duration"] + job.tolerance:
+            raise SystemExit(
+                f"plan id={e.id}: source_end {e.source_end:.3f}s exceeds "
+                f"source '{e.source_name}' duration {info['duration']:.3f}s "
+                f"(tolerance ±{job.tolerance}s)"
+            )
+
+    # Effective bg_volume per source: if source has no audio track, force to 0
+    # rather than letting ffmpeg fail on a missing 0:a stream reference.
+    no_audio_names = [n for n, info in source_info.items() if not info["has_audio"]]
+    if no_audio_names and job.bg_volume > 0.0:
+        print(f"  WARNING: source(s) {no_audio_names} have no audio track — "
+              f"bg_volume forced to 0 for segments from them")
+
+    segments = align(
+        cues, entries, sources_map, voices_map, legacy_source,
+        tolerance=job.tolerance, trim_direction=job.trim_direction,
+        on_short=job.on_short,
+    )
+
+    # With an explicit output, every side file (EDL, QC report, cache,
+    # extracted clips) lands next to it; otherwise in <plan dir>/edit.
+    edit_dir = (job.output.parent if job.output else job.plan.parent / "edit")
+    edit_dir.mkdir(parents=True, exist_ok=True)
+    out_path = job.output.resolve() if job.output else (
+        edit_dir / f"final_srt_driven_{safe_ascii_name(job.name)}.mp4"
+    )
+
+    # Output-overwrite check only matters in modes that actually produce
+    # final output. Extract mode stops before any out_path is written, so
+    # checking it would produce spurious warnings about an unrelated file.
+    if job.mode == "full" and out_path.exists():
+        if job.no_overwrite:
+            raise SystemExit(f"output exists and --no-overwrite set: {out_path}")
+        print(f"  WARNING: overwriting existing output: {out_path}")
+
+    style_resolved = resolve_style(job.style, cues)
+    print(f"  style: {job.style} ({len(cues)} cues, cjk={has_cjk(cues)})  mode={job.mode}")
+
+    # All intermediates live in a safe-ASCII temp dir under tempfile.gettempdir().
+    # Wiped at start so a previous crashed run cannot pollute. Wiped at end
+    # (in finally) unless --keep-intermediates is set.
+    work_dir = make_safe_work_dir(job.name, job.plan)
+    print(f"  work dir: {work_dir}")
+
+    try:
+        # SRT normalized to UTF-8 with encoding fallback (handles GB18030 input).
+        # Lives in the safe work dir so its path is guaranteed friendly to libass.
+        safe_subs = work_dir / "subs.srt"
+        safe_subs.write_text(read_srt_text(job.srt), encoding="utf-8")
+
+        edl_path = edit_dir / f"edl_srt_driven_{safe_ascii_name(job.name)}.json"
+        write_edl(segments, job.srt, job.plan, job.bg_volume, job.style, edl_path)
+
+        clips_dir = work_dir / "clips"
+        clips_dir.mkdir(parents=True, exist_ok=True)
+        cache_dir = edit_dir / "cache_srt_driven"
+
+        # Orientation is decided from the first segment's source only and then
+        # reused for every gap clip and cache key.
+        # NOTE(review): assumes `segments` is non-empty — presumably guaranteed
+        # by the validation above; confirm.
+        portrait = is_portrait_source(segments[0].source_path)
+
+        clip_paths: list[Path] = []
+        seg_clip_info: list[dict] = []
+        any_voice = any(s.voice_path is not None for s in segments)
+
+        print(f"\n  extracting {len(segments)} segments  cache={'off' if job.no_cache else 'on'}  voice={'per-seg' if any_voice else 'none'}")
+        for i, seg in enumerate(segments):
+            # Gap clips are a concat-time concept (synthetic black + silence
+            # bridging non-contiguous SRT cues). Extract mode emits only the
+            # real source segments, so skip gap clips entirely there.
+            if job.mode != "extract" and seg.leading_gap > 0.001:
+                gap_path = clips_dir / f"gap_{i:02d}_{seg.leading_gap:.3f}.mp4"
+                if not gap_path.exists():
+                    make_gap_clip(seg.leading_gap, portrait, gap_path)
+                clip_paths.append(gap_path)
+
+            seg_path = clips_dir / f"seg_{i:02d}_id{seg.id}.mp4"
+            voice_sig = _voice_signature(seg.voice_path, seg.duration)
+
+            # Effective bg_volume for THIS segment: forced to 0 if its source
+            # has no audio track. Keeps ffmpeg from referencing a missing 0:a.
+            src_has_audio = source_info_by_path[seg.source_path]["has_audio"]
+            effective_bg = job.bg_volume if src_has_audio else 0.0
+
+            # ck stays None when caching is disabled, which routes every
+            # segment through a fresh extract below.
+            ck = cache_key(
+                seg,
+                effective_bg_volume=effective_bg,
+                hdr=is_hdr_source(seg.source_path),
+                portrait=portrait,
+                voice_signature=voice_sig,
+                ffmpeg_version=ffmpeg_version,
+            ) if not job.no_cache else None
+
+            cached_hit = False
+            if ck and (hit := cache_lookup(cache_dir, ck)) is not None:
+                shutil.copy2(hit, seg_path)
+                print(f"  [cache hit] id={seg.id} → {seg_path.name}")
+                cached_hit = True
+            else:
+                extract_segment(seg, seg_path, bg_volume=effective_bg)
+                if ck:
+                    cache_store(cache_dir, ck, seg_path)
+
+            clip_paths.append(seg_path)
+            seg_clip_info.append({"clip_path": str(seg_path), "cached": cached_hit})
+
+        # ---- Extract mode: copy clips to a persistent location and stop ----
+        if job.mode == "extract":
+            extracted_dir = edit_dir / f"extracted_clips_{safe_ascii_name(job.name)}"
+            extracted_dir.mkdir(parents=True, exist_ok=True)
+            # Wipe stale clips from a prior run so the dir reflects only this
+            # run's segments — same pattern as srt_video_editor.py.
+            for stale in extracted_dir.glob("clip_*.mp4"):
+                stale.unlink()
+            copied: list[dict] = []
+            for seg, info in zip(segments, seg_clip_info):
+                src = Path(info["clip_path"])
+                if not src.exists():
+                    continue
+                dst = extracted_dir / f"clip_{seg.id:03d}.mp4"
+                shutil.copy2(src, dst)
+                copied.append({
+                    "id": seg.id,
+                    "filename": dst.name,
+                    "expected_duration_s": round(seg.duration, 3),
+                    "cached_from_prev_run": info["cached"],
+                })
+            print(f"\n=== extract mode: stopping after segment extraction ===")
+            print(f"  {len(copied)} clip(s) saved to: {extracted_dir}/")
+            for c in copied:
+                print(f"    {c['filename']:<24} "
+                      f"({c['expected_duration_s']:.3f}s)"
+                      + ("  [cache hit]" if c["cached_from_prev_run"] else ""))
+            return {
+                "job": job.name,
+                "ok": True,
+                "mode": "extract",
+                "extracted_dir": str(extracted_dir),
+                "clip_count": len(copied),
+                "segments": copied,
+                "elapsed_s": round(time.time() - t0, 2),
+            }
+        # ---- Full mode continues to concat + compose ----
+
+        base_path = work_dir / "base.mp4"
+        concat_clips(clip_paths, base_path, work_dir)
+
+        # The last segment's output end time is used as the expected length
+        # of the whole timeline.
+        total_duration = segments[-1].out_end
+        burn_subtitles(
+            base_path, safe_subs, style_resolved, job.fontsdir, out_path,
+            global_voice=global_voice,
+            total_duration=total_duration,
+        )
+
+        # QC voice flag must reflect EITHER per-segment OR global voice usage.
+        voice_used = any_voice or (global_voice is not None)
+
+        qc_path = edit_dir / f"qc_report_{safe_ascii_name(job.name)}.json"
+        qc_report = build_qc_report(
+            job_name=job.name,
+            segments=segments,
+            seg_clip_info=seg_clip_info,
+            output_path=out_path,
+            expected_duration=total_duration,
+            style_name=job.style,
+            style_resolved=style_resolved,
+            bg_volume=job.bg_volume,
+            has_any_voice=voice_used,
+            elapsed_s=time.time() - t0,
+            edit_dir=edit_dir,
+            work_dir=work_dir,
+            cache_dir=cache_dir,
+            out_qc_path=qc_path,
+        )
+        print(f"\n  done in {qc_report['elapsed_s']}s, drift={qc_report['duration']['drift_ms']}ms")
+        return qc_report
+
+    finally:
+        # Runs on success, on the extract-mode early return, and on failure.
+        if job.keep_intermediates:
+            print(f"  intermediates kept at: {work_dir}")
+        else:
+            shutil.rmtree(work_dir, ignore_errors=True)
+
+
+# ============================================================================
+# Batch manifest
+# ============================================================================
+
+
+def make_failure_record(
+    *,
+    index: int,
+    name: str,
+    error: BaseException,
+    job: "Job | None" = None,
+    manifest_row: dict | None = None,
+) -> dict:
+    """Describe one failed batch job as a summary entry.
+
+    The returned dict has the keys `job`, `ok` (always False), `index`,
+    `error`, `stderr_tail`, `srt`, `plan`, `source`, `output`, in that order.
+
+    The four path fields are taken from `job` when one was built (stringified,
+    falsy values become None). If the row failed before a Job existed they are
+    read straight from `manifest_row`. With neither, all four are None.
+
+    `stderr_tail` carries text only when `error` is a `PipelineError`; for
+    every other exception type (e.g. `SystemExit` from validation) it is "".
+    """
+    path_keys = ("srt", "plan", "source", "output")
+
+    if job is not None:
+        paths = {}
+        for key in path_keys:
+            value = getattr(job, key)
+            paths[key] = str(value) if value else None
+    elif manifest_row is not None:
+        paths = {key: manifest_row.get(key) for key in path_keys}
+    else:
+        paths = dict.fromkeys(path_keys)
+
+    tail = (error.stderr_tail or "") if isinstance(error, PipelineError) else ""
+
+    record = {
+        "job": name,
+        "ok": False,
+        "index": index,
+        "error": str(error),
+        "stderr_tail": tail,
+    }
+    record.update(paths)
+    return record
+
+
+def load_manifest(path: Path) -> list[dict]:
+    """Load a batch manifest (.json or .csv) into a list of job-row dicts.
+
+    JSON manifests must be an array of objects. CSV manifests use the header
+    row as keys, and empty cells are dropped so the row falls back to CLI
+    defaults for that option.
+
+    Both formats are read as UTF-8 with an optional BOM, because Windows
+    editors commonly write one and `json.loads` rejects a leading BOM.
+
+    Args:
+        path: manifest file; the format is chosen by its suffix.
+
+    Returns:
+        One dict per job, in file order.
+
+    Raises:
+        SystemExit: the file is unreadable or not valid UTF-8, the JSON is
+            malformed, the JSON is not an array of objects, the CSV is
+            malformed, or the suffix is unsupported.
+    """
+    suffix = path.suffix.lower()
+    if suffix == ".json":
+        try:
+            raw = path.read_text(encoding="utf-8-sig")
+        except (OSError, UnicodeDecodeError) as e:
+            # UnicodeDecodeError: e.g. a manifest saved as GBK on Windows.
+            raise SystemExit(f"batch manifest unreadable: {path}: {e}")
+        try:
+            data = json.loads(raw)
+        except json.JSONDecodeError as e:
+            raise SystemExit(
+                f"batch manifest is not valid JSON: {path}: "
+                f"line {e.lineno} col {e.colno}: {e.msg}"
+            )
+        if not isinstance(data, list):
+            raise SystemExit("batch manifest JSON must be an array of job dicts")
+        # Reject non-object rows up front: downstream code calls .get() on
+        # every row, including inside its error handlers.
+        for i, row in enumerate(data):
+            if not isinstance(row, dict):
+                raise SystemExit(
+                    f"batch manifest row {i} must be a JSON object, "
+                    f"got {type(row).__name__}"
+                )
+        return data
+    if suffix == ".csv":
+        rows: list[dict] = []
+        try:
+            with path.open(newline="", encoding="utf-8-sig") as f:
+                for row in csv.DictReader(f):
+                    rows.append({k: v for k, v in row.items() if v != ""})
+        except (OSError, UnicodeDecodeError, csv.Error) as e:
+            raise SystemExit(f"batch manifest CSV error: {path}: {e}")
+        return rows
+    raise SystemExit(f"unsupported manifest format: {suffix}")
+
+
+def job_from_dict(d: dict, defaults: argparse.Namespace, manifest_dir: Path,
+                  idx: int) -> Job:
+    """Build a `Job` from one manifest row, filling gaps from CLI defaults.
+
+    Args:
+        d: one manifest row (JSON object or CSV row dict).
+        defaults: parsed CLI namespace supplying the fallback for any option
+            the row omits or leaves empty.
+        manifest_dir: directory of the manifest; relative paths in the row
+            are resolved against it.
+        idx: row index, used in error messages and in the auto-generated
+            output file name.
+
+    Returns:
+        The resolved Job. When the row has no `output`, one is generated in
+        `manifest_dir` with the row index in its name.
+
+    Raises:
+        SystemExit: `srt` or `plan` is missing, a numeric option is not a
+            number, or `mode` / `trim_direction` / `on_short` holds a value
+            the CLI would reject.
+    """
+    def _path(key: str) -> Path | None:
+        v = d.get(key)
+        if v in (None, ""):
+            return None
+        p = Path(v)
+        return p if p.is_absolute() else (manifest_dir / p).resolve()
+
+    def _float(key: str, fb: float) -> float:
+        v = d.get(key)
+        if v in (None, ""):
+            return fb
+        try:
+            return float(v)
+        except (TypeError, ValueError):
+            # Name the row and the key instead of surfacing a bare ValueError.
+            raise SystemExit(
+                f"manifest row {idx}: {key} must be a number, got {v!r}"
+            )
+
+    def _str(key: str, fb: str) -> str:
+        v = d.get(key)
+        return str(v) if v not in (None, "") else fb
+
+    def _bool(key: str, fb: bool) -> bool:
+        v = d.get(key)
+        if isinstance(v, bool):
+            return v
+        if v in (None, ""):
+            return fb
+        return str(v).lower() in ("1", "true", "yes", "on")
+
+    def _choice(key: str, fb: str, allowed: tuple[str, ...]) -> str:
+        # argparse enforces `choices` only for CLI arguments; manifest rows
+        # bypass it, so the same sets are enforced here.
+        v = _str(key, fb)
+        if v not in allowed:
+            raise SystemExit(
+                f"manifest row {idx}: invalid {key} {v!r}; "
+                f"expected one of {list(allowed)}"
+            )
+        return v
+
+    srt_path = _path("srt")
+    plan_path = _path("plan")
+    if srt_path is None:
+        raise SystemExit(f"manifest row {idx}: missing srt")
+    if plan_path is None:
+        raise SystemExit(f"manifest row {idx}: missing plan")
+
+    job_name = _str("name", plan_path.stem)
+    explicit_output = _path("output")
+    if explicit_output is None:
+        # Auto-isolate outputs by index so two jobs with the same name never
+        # silently overwrite each other.
+        explicit_output = (
+            manifest_dir / f"final_srt_driven_{safe_ascii_name(job_name)}_{idx:02d}.mp4"
+        )
+
+    row_mode = _str("mode", getattr(defaults, "mode", "full"))
+    if row_mode not in ("full", "extract"):
+        raise SystemExit(
+            f"manifest row {idx}: invalid mode {row_mode!r}; "
+            "expected 'full' or 'extract'"
+        )
+
+    trim_direction = _choice(
+        "trim_direction", defaults.trim_direction, ("tail", "head", "center")
+    )
+    on_short = _choice("on_short", defaults.on_short, ("error", "pad"))
+
+    return Job(
+        source=_path("source"),
+        srt=srt_path,
+        plan=plan_path,
+        voice=_path("voice"),
+        bg_volume=_float("bg_volume", defaults.bg_volume),
+        tolerance=_float("tolerance", defaults.tolerance),
+        trim_direction=trim_direction,
+        on_short=on_short,
+        style=_str("style", defaults.style),
+        fontsdir=_path("fontsdir"),
+        output=explicit_output,
+        name=job_name,
+        no_cache=_bool("no_cache", defaults.no_cache),
+        keep_intermediates=_bool("keep_intermediates", defaults.keep_intermediates),
+        no_overwrite=_bool("no_overwrite", defaults.no_overwrite),
+        mode=row_mode,
+    )
+
+
+# ============================================================================
+# CLI
+# ============================================================================
+
+
+def main() -> None:
+    """CLI entry point: run a single job, or every row of a batch manifest.
+
+    Batch mode (`--batch`) builds one Job per manifest row, runs them in
+    order and writes `<manifest stem>_qc_summary.json` next to the manifest.
+    A failing row aborts the batch unless `--continue-on-error` is set, in
+    which case a failure record is stored and the next row runs.
+
+    NOTE: with `--continue-on-error`, failed jobs appear only in the summary
+    and the printed count; this function still returns normally.
+
+    Single-job mode requires `--srt` and `--plan`.
+    """
+    ap = argparse.ArgumentParser(description="SRT-driven edit assembly")
+    ap.add_argument("--source", type=Path, default=None,
+                    help="Form A: single source.mp4. Ignored if plan declares sources.")
+    ap.add_argument("--srt", type=Path, default=None, help="script.srt")
+    ap.add_argument("--plan", type=Path, default=None, help="edit_plan.json (Form A or B)")
+    ap.add_argument("--voice", type=Path, default=None,
+                    help="Global voice.wav spanning the whole timeline. "
+                         "Mutually exclusive with per-segment voices in the plan.")
+    ap.add_argument("--bg-volume", type=float, default=0.0,
+                    help="original audio level (0.0=mute, 0.1=10%%). Default 0.0.")
+    ap.add_argument("--tolerance", type=float, default=0.5,
+                    help="seconds. |source_dur - srt_dur| > tolerance triggers trim/error.")
+    ap.add_argument("--trim-direction", choices=["tail", "head", "center"], default="tail")
+    ap.add_argument("--on-short", choices=["error", "pad"], default="error")
+    ap.add_argument("--style", default="auto",
+                    help=f"subtitle style. Templates: {sorted(STYLE_TEMPLATES)}. "
+                         "'auto' picks cjk-natural if SRT has CJK, else bold-uppercase. "
+                         "Pass a raw ASS string containing '=' to override.")
+    ap.add_argument("--fontsdir", type=Path, default=None,
+                    help="extra fonts directory passed to libass.")
+    ap.add_argument("-o", "--output", type=Path, default=None)
+    ap.add_argument(
+        "--mode", choices=["full", "extract"], default="full",
+        help="'full' (default) runs extract -> concat -> subtitle burn. "
+             "'extract' stops after segment extraction and saves per-cue "
+             "clips to <edit_dir>/extracted_clips_<job>/clip_<id>.mp4; "
+             "gap clips, voice mixing, subtitle burn, and QC report are "
+             "skipped.",
+    )
+    ap.add_argument("--no-cache", action="store_true")
+    ap.add_argument("--no-overwrite", action="store_true",
+                    help="refuse to run if output file already exists.")
+    ap.add_argument("--keep-intermediates", action="store_true",
+                    help="keep the temp work dir (clips, base, concat list) after rendering.")
+    ap.add_argument("--batch", type=Path, default=None,
+                    help="run a batch manifest (jobs.json or jobs.csv) instead.")
+    ap.add_argument("--continue-on-error", action="store_true",
+                    help="when --batch: skip failing jobs instead of aborting.")
+    args = ap.parse_args()
+
+    versions = preflight()
+    print(f"== preflight: ffmpeg {versions['ffmpeg']} / ffprobe {versions['ffprobe']} ==")
+
+    if args.batch is not None:
+        manifest_path = args.batch.resolve()
+        rows = load_manifest(manifest_path)
+        results: list[dict] = []
+        for i, row in enumerate(rows):
+            try:
+                job = job_from_dict(row, args, manifest_path.parent, i)
+            except (SystemExit, Exception) as e:
+                if args.continue_on_error:
+                    print(f"[batch {i}] skipped: {type(e).__name__}: {e}")
+                    # A malformed row may not be a dict at all; calling
+                    # .get() on it here would crash the handler and defeat
+                    # --continue-on-error.
+                    row_is_dict = isinstance(row, dict)
+                    results.append(make_failure_record(
+                        index=i,
+                        name=row.get("name", f"row{i}") if row_is_dict else f"row{i}",
+                        error=e, job=None,
+                        manifest_row=row if row_is_dict else None,
+                    ))
+                    continue
+                raise
+            try:
+                results.append(run_job(job, versions["ffmpeg"]))
+            except (SystemExit, Exception) as e:
+                if args.continue_on_error:
+                    print(f"[batch {i}] FAILED: {type(e).__name__}: {e}")
+                    results.append(make_failure_record(
+                        index=i, name=job.name, error=e, job=job,
+                    ))
+                    continue
+                raise
+        # Counted once and shared by the summary file and the console line.
+        ok = sum(1 for r in results if r.get("ok"))
+        summary_path = manifest_path.with_name(manifest_path.stem + "_qc_summary.json")
+        summary_path.write_text(
+            json.dumps({"jobs": results, "total": len(results), "ok": ok},
+                       indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+        print(f"\nbatch QC summary → {summary_path}")
+        print(f"  {ok}/{len(results)} jobs ok")
+        return
+
+    if args.srt is None or args.plan is None:
+        ap.error("--srt and --plan required (or use --batch)")
+
+    job = Job(
+        source=args.source.resolve() if args.source else None,
+        srt=args.srt.resolve(),
+        plan=args.plan.resolve(),
+        voice=args.voice.resolve() if args.voice else None,
+        bg_volume=args.bg_volume,
+        tolerance=args.tolerance,
+        trim_direction=args.trim_direction,
+        on_short=args.on_short,
+        style=args.style,
+        fontsdir=args.fontsdir.resolve() if args.fontsdir else None,
+        output=args.output.resolve() if args.output else None,
+        name=args.plan.stem,
+        no_cache=args.no_cache,
+        keep_intermediates=args.keep_intermediates,
+        no_overwrite=args.no_overwrite,
+        mode=args.mode,
+    )
+    run_job(job, versions["ffmpeg"])
+
+
+# Script entry point: run the CLI only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/helpers/transcribe.py b/helpers/transcribe.py
index 26d3906..a8b28ac 100644
--- a/helpers/transcribe.py
+++ b/helpers/transcribe.py
@@ -1,16 +1,37 @@
-"""Transcribe a video with ElevenLabs Scribe.
+"""Transcribe a video with Alibaba DashScope Paraformer-v2 (realtime, file mode).
+
+Extracts mono 16kHz PCM audio via ffmpeg, streams it to DashScope's
+paraformer-realtime-v2 model via the official `dashscope` SDK, and
+writes a Scribe-compatible JSON transcript so the downstream
+recommend_edit_plan helper keeps working without changes.
+
+Output schema (intentionally Scribe-shaped):
+    {
+      "language_code": "auto" | "<lang>",
+      "_source": "dashscope-paraformer-realtime-v2",
+      "words": [
+        {"text": "你好", "start": 1.234, "end": 1.567, "type": "word"},
+        ...
+      ]
+    }
+
+Tradeoffs vs the previous ElevenLabs Scribe integration:
+  - No speaker diarization — paraformer does not segment speakers,
+    so `speaker_id` is omitted from every word record.
+  - No audio events — Scribe's "(laughter)" / "(applause)" entries
+    with `"type": "audio_event"` are simply absent.
+  - The `num_speakers` parameter of transcribe_one is still accepted for
+    backward compatibility with transcribe_batch, but it is ignored.
 
-Extracts mono 16kHz audio via ffmpeg, uploads to Scribe with verbatim +
-diarize + audio events + word-level timestamps, writes the full response
-to <edit_dir>/transcripts/<video_stem>.json.
+Cached: if the output transcript already exists, the API call is skipped.
 
-Cached: if the output file already exists, the upload is skipped.
+API key:
+    DASHSCOPE_API_KEY in <repo>/.env or in the environment.
 
 Usage:
     python helpers/transcribe.py <video_path>
+    python helpers/transcribe.py <video_path> --language zh
     python helpers/transcribe.py <video_path> --edit-dir /custom/edit
-    python helpers/transcribe.py <video_path> --language en
-    python helpers/transcribe.py <video_path> --num-speakers 2
 """
 
 from __future__ import annotations
@@ -24,29 +45,34 @@
 import time
 from pathlib import Path
 
-import requests
 
-
-SCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text"
+DASHSCOPE_MODEL = "paraformer-realtime-v2"
+ENV_VAR = "DASHSCOPE_API_KEY"
 
 
 def load_api_key() -> str:
+    """Return the DashScope API key, exiting the process if none is found.
+
+    Lookup order: <repo>/.env, then ./.env, then the DASHSCOPE_API_KEY
+    environment variable. The first .env file that defines the key wins, so
+    a .env entry takes precedence over an exported environment variable.
+    Values read from .env have surrounding whitespace and quotes stripped.
+
+    NOTE(review): a .env line with an empty value is returned as "" instead
+    of falling through to the environment.
+    """
     for candidate in [Path(__file__).resolve().parent.parent / ".env", Path(".env")]:
         if candidate.exists():
-            for line in candidate.read_text().splitlines():
+            for line in candidate.read_text(encoding="utf-8").splitlines():
                 line = line.strip()
                 if not line or line.startswith("#") or "=" not in line:
                     continue
                 k, v = line.split("=", 1)
-                if k.strip() == "ELEVENLABS_API_KEY":
+                if k.strip() == ENV_VAR:
                     return v.strip().strip('"').strip("'")
-    v = os.environ.get("ELEVENLABS_API_KEY", "")
+    v = os.environ.get(ENV_VAR, "")
     if not v:
-        sys.exit("ELEVENLABS_API_KEY not found in .env or environment")
+        sys.exit(
+            f"{ENV_VAR} not found in .env or environment. "
+            f"Generate one at https://dashscope.console.aliyun.com/ "
+            f"and put `{ENV_VAR}=...` in <repo>/.env."
+        )
     return v
 
 
 def extract_audio(video_path: Path, dest: Path) -> None:
+    """Extract mono 16kHz PCM WAV — the format paraformer-v2 expects."""
     cmd = [
         "ffmpeg", "-y", "-i", str(video_path),
         "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le",
@@ -55,36 +81,109 @@ def extract_audio(video_path: Path, dest: Path) -> None:
     subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
 
-def call_scribe(
+def _convert_dashscope_to_scribe(
+    sentences: list[dict],
+    language_hint: str | None,
+) -> dict:
+    """Flatten DashScope sentence/word structure into Scribe-compatible shape.
+
+    DashScope returns:
+        sentence: [
+          {begin_time, end_time, text,
+           words: [{begin_time, end_time, text, punctuation}, ...]}
+        ]
+
+    recommend_edit_plan.load_transcript_words wants a flat words[] with
+    seconds-based start/end and a 'word' type marker. Convert here so the
+    consumer stays Scribe-shaped and we don't need to touch recommender code.
+
+    Punctuation handling:
+      - A word's own `punctuation` field is appended to its text.
+      - An entry that is punctuation only (empty text with a `punctuation`
+        value, or text made up solely of punctuation characters) is folded
+        into the preceding word's text. With no preceding word, an empty-text
+        entry is dropped and a punctuation-text entry is kept as a word.
+      - Entries with neither text nor punctuation are dropped.
+
+    Args:
+        sentences: list of sentence dicts; a single sentence dict or None is
+            also tolerated.
+        language_hint: stored as `language_code`; None becomes "auto".
+
+    Returns:
+        Dict with keys `language_code`, `_source` and `words`, where each
+        word is `{text, start, end, type}` with times in seconds.
+    """
+    import unicodedata
+
+    def _is_punct_only(s: str) -> bool:
+        return all(unicodedata.category(ch).startswith("P") for ch in s)
+
+    # A lone sentence may arrive as a bare dict; iterating it directly would
+    # walk its keys and crash on `.get`.
+    if isinstance(sentences, dict):
+        sentences = [sentences]
+
+    words: list[dict] = []
+    for sent in sentences or []:
+        if not isinstance(sent, dict):
+            continue
+        for w in (sent.get("words") or []):
+            if not isinstance(w, dict):
+                continue
+            text = (w.get("text") or "").strip()
+            punct = (w.get("punctuation") or "").strip()
+            if not text:
+                # Punctuation-only entry: keep the mark on the previous word.
+                if punct and words:
+                    words[-1]["text"] += punct
+                continue
+            if words and _is_punct_only(text):
+                words[-1]["text"] += text + punct
+                continue
+            try:
+                start_ms = float(w.get("begin_time") or 0)
+                end_ms = float(w.get("end_time") or 0)
+            except (TypeError, ValueError):
+                continue
+            words.append({
+                "text": text + punct,
+                "start": start_ms / 1000.0,
+                "end": end_ms / 1000.0,
+                "type": "word",
+            })
+    return {
+        "language_code": language_hint or "auto",
+        "_source": f"dashscope-{DASHSCOPE_MODEL}",
+        "words": words,
+    }
+
+
+def call_dashscope(
     audio_path: Path,
     api_key: str,
     language: str | None = None,
-    num_speakers: int | None = None,
 ) -> dict:
-    data: dict[str, str] = {
-        "model_id": "scribe_v1",
-        "diarize": "true",
-        "tag_audio_events": "true",
-        "timestamps_granularity": "word",
-    }
-    if language:
-        data["language_code"] = language
-    if num_speakers:
-        data["num_speakers"] = str(num_speakers)
-
-    with open(audio_path, "rb") as f:
-        resp = requests.post(
-            SCRIBE_URL,
-            headers={"xi-api-key": api_key},
-            files={"file": (audio_path.name, f, "audio/wav")},
-            data=data,
-            timeout=1800,
+    """Call paraformer-realtime-v2 in file mode. Returns Scribe-shaped dict.
+
+    The dashscope SDK handles WebSocket framing internally when given a
+    local file path — no manual chunking required. Defensive against
+    minor SDK shape variations: tolerates both `output.sentence` and
+    `output.sentences` (the docs and the wire format have shifted).
+
+    Args:
+        audio_path: local WAV file handed to the SDK via `file=`.
+        api_key: assigned to `dashscope.api_key`, i.e. module-level SDK
+            state shared by later calls in the same process.
+        language: optional single language hint; None sends no hints.
+
+    Returns:
+        The dict built by `_convert_dashscope_to_scribe`.
+
+    Raises:
+        SystemExit: the `dashscope` package cannot be imported.
+        RuntimeError: the response's `status_code` is not 200.
+    """
+    # Imported lazily so the module can be imported without dashscope.
+    try:
+        import dashscope
+        from dashscope.audio.asr import Recognition
+    except ImportError:
+        raise SystemExit(
+            "dashscope package not installed. Install with:\n"
+            "  pip install dashscope\n"
+            "(or `pip install -e .` from the repo root once dashscope is in "
+            "your project deps)."
         )
 
-    if resp.status_code != 200:
-        raise RuntimeError(f"Scribe returned {resp.status_code}: {resp.text[:500]}")
+    dashscope.api_key = api_key
+    # Pin to the Mainland China endpoints explicitly. Both URLs are the SDK
+    # defaults, but stale DASHSCOPE_HTTP_BASE_URL / DASHSCOPE_WEBSOCKET_BASE_URL
+    # env vars (left over from an international account) would otherwise route
+    # us to the wrong region and produce a misleading 401 from the intl host
+    # even when the key is valid on the domestic side.
+    dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"
+    dashscope.base_websocket_api_url = (
+        "wss://dashscope.aliyuncs.com/api-ws/v1/inference"
+    )
+
+    language_hints = [language] if language else None
+
+    # format / sample_rate match the mono 16 kHz PCM WAV that extract_audio
+    # produces.
+    recognition = Recognition(
+        model=DASHSCOPE_MODEL,
+        format="wav",
+        sample_rate=16000,
+        language_hints=language_hints,
+        callback=None,
+    )
+    response = recognition.call(file=str(audio_path))
+
+    # getattr with defaults: a response missing these attributes is reported
+    # as a failure rather than raising AttributeError.
+    status = getattr(response, "status_code", None)
+    if status != 200:
+        msg = getattr(response, "message", None) or str(response)
+        request_id = getattr(response, "request_id", "")
+        raise RuntimeError(
+            f"DashScope {DASHSCOPE_MODEL} returned status={status} "
+            f"request_id={request_id}: {msg}"
+        )
 
-    return resp.json()
+    output = getattr(response, "output", None) or {}
+    # Both shapes seen in the wild; honour either.
+    sentences = output.get("sentence") or output.get("sentences") or []
+    return _convert_dashscope_to_scribe(sentences, language)
 
 
 def transcribe_one(
@@ -97,8 +196,19 @@ def transcribe_one(
 ) -> Path:
     """Transcribe a single video. Returns path to transcript JSON.
 
+    `num_speakers` is accepted for backward compatibility with the previous
+    ElevenLabs Scribe interface (and with transcribe_batch.py's call site)
+    but is ignored — paraformer does not perform speaker diarization. A
+    one-line note is printed when a non-None value is supplied in verbose mode.
+
     Cached: returns existing path immediately if the transcript already exists.
     """
+    if num_speakers is not None and verbose:
+        print(
+            f"  (note: --num-speakers={num_speakers} ignored — DashScope "
+            f"{DASHSCOPE_MODEL} has no speaker diarization)"
+        )
+
     transcripts_dir = edit_dir / "transcripts"
     transcripts_dir.mkdir(parents=True, exist_ok=True)
     out_path = transcripts_dir / f"{video.stem}.json"
@@ -117,23 +227,33 @@ def transcribe_one(
         extract_audio(video, audio)
         size_mb = audio.stat().st_size / (1024 * 1024)
         if verbose:
-            print(f"  uploading {video.stem}.wav ({size_mb:.1f} MB)", flush=True)
-        payload = call_scribe(audio, api_key, language, num_speakers)
-
-    out_path.write_text(json.dumps(payload, indent=2))
+            print(
+                f"  streaming {video.stem}.wav ({size_mb:.1f} MB) "
+                f"to DashScope {DASHSCOPE_MODEL}",
+                flush=True,
+            )
+        payload = call_dashscope(audio, api_key, language)
+
+    # ensure_ascii=False so CJK characters are stored as-is (smaller file +
+    # human-readable when inspecting transcripts).
+    out_path.write_text(
+        json.dumps(payload, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
     dt = time.time() - t0
 
     if verbose:
         kb = out_path.stat().st_size / 1024
-        print(f"  saved: {out_path.name} ({kb:.1f} KB) in {dt:.1f}s")
-        if isinstance(payload, dict) and "words" in payload:
-            print(f"    words: {len(payload['words'])}")
+        words_count = len(payload.get("words", []))
+        print(f"  saved: {out_path.name} ({kb:.1f} KB, {words_count} words) in {dt:.1f}s")
 
     return out_path
 
 
 def main() -> None:
-    ap = argparse.ArgumentParser(description="Transcribe a video with ElevenLabs Scribe")
+    ap = argparse.ArgumentParser(
+        description=f"Transcribe a video with DashScope {DASHSCOPE_MODEL}"
+    )
     ap.add_argument("video", type=Path, help="Path to video file")
     ap.add_argument(
         "--edit-dir",
@@ -145,13 +265,7 @@ def main() -> None:
         "--language",
         type=str,
         default=None,
-        help="Optional ISO language code (e.g., 'en'). Omit to auto-detect.",
-    )
-    ap.add_argument(
-        "--num-speakers",
-        type=int,
-        default=None,
-        help="Optional number of speakers when known. Improves diarization accuracy.",
+        help="Language hint (e.g. 'zh', 'en', 'ja'). Omit to auto-detect.",
     )
     args = ap.parse_args()
 
@@ -167,7 +281,6 @@ def main() -> None:
         edit_dir=edit_dir,
         api_key=api_key,
         language=args.language,
-        num_speakers=args.num_speakers,
     )
 
 
diff --git a/helpers/transcribe_batch.py b/helpers/transcribe_batch.py
index 5aeb1d6..3fe86e0 100644
--- a/helpers/transcribe_batch.py
+++ b/helpers/transcribe_batch.py
@@ -1,14 +1,15 @@
 """Batch-transcribe every video in a directory with 4 parallel workers.
 
-Walks <videos_dir> for common video extensions, runs ElevenLabs Scribe on
-each, writes transcripts to <videos_dir>/edit/transcripts/<name>.json.
+Walks <videos_dir> for common video extensions, transcribes each via
+DashScope paraformer-v2 (see helpers/transcribe.py), writes transcripts
+to <videos_dir>/edit/transcripts/<name>.json.
 
 Cached per-file: any source that already has a transcript is skipped.
 
 Usage:
     python helpers/transcribe_batch.py <videos_dir>
     python helpers/transcribe_batch.py <videos_dir> --workers 4
-    python helpers/transcribe_batch.py <videos_dir> --num-speakers 2
+    python helpers/transcribe_batch.py <videos_dir> --language zh
     python helpers/transcribe_batch.py <videos_dir> --edit-dir /custom/edit
 """
 
@@ -48,13 +49,7 @@ def main() -> None:
         "--language",
         type=str,
         default=None,
-        help="Optional ISO language code. Omit to auto-detect per file.",
-    )
-    ap.add_argument(
-        "--num-speakers",
-        type=int,
-        default=None,
-        help="Optional number of speakers. Improves diarization when known.",
+        help="Language hint (e.g. 'zh', 'en'). Omit to auto-detect per file.",
     )
     args = ap.parse_args()
 
@@ -91,7 +86,6 @@ def main() -> None:
                 edit_dir=edit_dir,
                 api_key=api_key,
                 language=args.language,
-                num_speakers=args.num_speakers,
                 verbose=False,
             ): v
             for v in pending
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..3e43b97
--- /dev/null
+++ b/main.py
@@ -0,0 +1,86 @@
+"""Project-root entry point for the SRT-driven editor.
+
+A thin wrapper over `helpers/srt_driven_edit.py` that fills in the
+`input/` -> `output/` layout described in CLAUDE.md so the common case
+collapses to a single command:
+
+    python main.py
+
+The wrapper injects these defaults only when the corresponding flag is
+absent from `sys.argv`:
+
+    --srt    input/script.srt          (always; required by srt_driven_edit)
+    --plan   input/edit_plan.json      (always; required by srt_driven_edit)
+    --source input/source.mp4          (only if the file exists)
+    --voice  input/voice.wav           (only if the file exists)
+    -o       output/final.mp4          (always; output/ is auto-created)
+
+Anything you pass explicitly wins. Batch mode (`--batch <manifest>`) skips
+all single-job defaults so the manifest fully owns its own paths.
+
+Examples:
+    python main.py
+    python main.py --bg-volume 0.1 --style cjk-natural
+    python main.py --plan plans/custom.json -o out/custom.mp4
+    python main.py --batch jobs.json --continue-on-error
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# Wire helpers/ onto sys.path so `from srt_driven_edit import ...` works
+# regardless of the user's cwd.
+ROOT = Path(__file__).resolve().parent
+sys.path.insert(0, str(ROOT / "helpers"))
+
+from srt_driven_edit import main as _srt_driven_main  # noqa: E402
+
+
+def _has_flag(args: list[str], *flags: str) -> bool:
+    """True if a flag is given as `-o v`, `-oV`, `--flag v`, `--flag=v` or a `--fl` prefix."""
+    longs = [f for f in flags if f.startswith("--")]
+    shorts = tuple(f for f in flags if not f.startswith("--"))
+    names = [tok.split("=", 1)[0] for tok in args]  # drop any `=value` suffix
+    abbrev = any(len(n) > 2 and n[:2] == "--" and f.startswith(n) for n in names for f in longs)
+    return abbrev or any(n[:2] != "--" and n.startswith(shorts) for n in names)
+
+
+def _inject_defaults(args: list[str]) -> list[str]:
+    """Return a copy of `args` with the input/ -> output/ defaults appended.
+
+    A default is added only when its flag is absent, so explicit flags
+    always win; `--batch` suppresses every default because the manifest
+    owns its own paths. `--source` / `--voice` are injected only when the
+    file exists, so jobs that do not need them see no spurious error.
+    NOTE(review): existence checks and the output/ mkdir are anchored at
+    ROOT while the injected paths are cwd-relative — confirm cwd == ROOT.
+    """
+    merged = list(args)
+    if _has_flag(merged, "--batch"):
+        return merged
+
+    # (flag, default value, inject only if the file exists under ROOT)
+    table = (
+        ("--srt", "input/script.srt", False),
+        ("--plan", "input/edit_plan.json", False),
+        ("--source", "input/source.mp4", True),
+        ("--voice", "input/voice.wav", True),
+    )
+    for flag, value, needs_file in table:
+        if not _has_flag(merged, flag) and (not needs_file or (ROOT / value).exists()):
+            merged += [flag, value]
+    if not _has_flag(merged, "-o", "--output"):
+        (ROOT / "output").mkdir(exist_ok=True)
+        merged += ["-o", "output/final.mp4"]
+    return merged
+
+
+def main() -> None:  # entry point: inject the layout defaults, then delegate
+    sys.argv = [sys.argv[0], *_inject_defaults(sys.argv[1:])]
+    _srt_driven_main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index c2cff29..651d0fe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,7 @@ license = { file = "LICENSE" }
 requires-python = ">=3.10"
 dependencies = [
     "requests",
+    "dashscope>=1.20",
     "librosa",
     "matplotlib",
     "pillow",
@@ -14,6 +15,7 @@ dependencies = [
 
 [project.optional-dependencies]
 animations = ["manim"]
+dev = ["pytest>=7"]
 
 [build-system]
 requires = ["setuptools>=61.0"]
diff --git a/srt_video_editor.py b/srt_video_editor.py
new file mode 100644
index 0000000..2477e4b
--- /dev/null
+++ b/srt_video_editor.py
@@ -0,0 +1,478 @@
+"""srt_video_editor — minimal viable, learning-grade scaffold.
+
+================================================================
+THIS IS NOT THE PRODUCTION ENTRY POINT. Use `python main.py` (or
+`python helpers/srt_driven_edit.py` directly) for any real work.
+================================================================
+
+What this script does:
+  - reads script.srt + edit_plan.json
+  - validates ids match
+  - prints the planned mapping
+  - cuts each cue out of source.mp4 to temp/clip_<id:03d>.mp4
+  - lossless-concats into output/final.mp4
+
+What it deliberately DOES NOT do (use main.py / srt_driven_edit.py
+for any of these):
+  - encoding fallback — only UTF-8 / UTF-8-with-BOM SRT is accepted;
+    GB18030 / cp936 input will crash
+  - source range bounds check — a plan that overruns the source's
+    duration will surface as a confusing ffmpeg error, not a clear
+    "id=X exceeds source duration" up front
+  - QC report — no per-clip drift, no disk-usage accounting, no
+    structured failure record
+  - overwrite protection — every run silently `-y` overwrites the
+    temp/ clips and output/final.mp4
+  - audio fades at cut points (you may hear pops on hard cuts)
+  - voice replacement, subtitle burn, color grade, HDR tone-map,
+    sync tails, segment cache, batch / per-episode discovery
+
+Self-contained on purpose: no imports from helpers/, so the entire
+flow fits in one readable file. Use this to learn the pipeline; ship
+with main.py.
+
+Usage:
+    python srt_video_editor.py
+    python srt_video_editor.py --srt input/script.srt --plan input/edit_plan.json \\
+                               --source input/source.mp4 \\
+                               --temp-dir temp/ --output output/final.mp4
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+
+# ---------- timestamp helpers ----------
+
+_TS_RE = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})")
+
+
+def parse_ts(s: str) -> float:
+    """Convert 'HH:MM:SS,ms' / 'HH:MM:SS.ms' to seconds; ValueError if malformed."""
+    match = _TS_RE.fullmatch(s.strip())
+    if match is None:
+        raise ValueError(f"bad timestamp: {s!r}")
+    hours, minutes, secs, frac = match.groups()
+    return int(hours) * 3600 + int(minutes) * 60 + int(secs) + int(frac.ljust(3, "0")) / 1000.0
+
+
+def format_ts(seconds: float) -> str:
+    """Render seconds as SRT-style 'HH:MM:SS,mmm' (comma before milliseconds)."""
+    millis_total = int(round(seconds * 1000))
+    hours, remainder = divmod(millis_total, 3600_000)
+    minutes, remainder = divmod(remainder, 60_000)
+    secs, millis = divmod(remainder, 1000)
+    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, millis)
+
+
+def format_ts_dot(seconds: float) -> str:
+    """Render seconds as 'HH:MM:SS.mmm', the form passed to ffmpeg `-ss` / `-t`.
+
+    Same value as `format_ts`, different separator: SRT writes a comma
+    between seconds and milliseconds, ffmpeg expects a dot, so the comma
+    produced by `format_ts` is swapped for a dot here.
+    """
+    return ".".join(format_ts(seconds).split(","))
+
+
+# ---------- parsers ----------
+
+
+def parse_srt(path: Path) -> list[dict]:
+    """Return a list of {id, start, end, text} in file order.
+
+    Tolerates UTF-8 with or without BOM, CRLF / LF line endings, and
+    SRT cue settings ('position:90% align:start') trailing the time line.
+    Malformed blocks (non-integer id, missing '-->', bad or absent
+    timestamp, `end <= start`) exit via SystemExit, never a traceback.
+    """
+    raw = path.read_text(encoding="utf-8-sig")
+    cues: list[dict] = []
+    for block in re.split(r"\r?\n\r?\n+", raw.strip()):
+        lines = [ln for ln in block.splitlines() if ln.strip()]
+        if len(lines) < 2:
+            continue
+        try:
+            cid = int(lines[0].strip())
+        except ValueError:
+            raise SystemExit(f"SRT id line is not an integer: {lines[0]!r}")
+        if "-->" not in lines[1]:
+            raise SystemExit(f"SRT block missing '-->' time line: {lines[1]!r}")
+        left, right = lines[1].split("-->", 1)
+        try:
+            start, end = parse_ts(left.split()[-1]), parse_ts(right.split()[0])
+        except (ValueError, IndexError):
+            raise SystemExit(f"SRT id={cid}: bad time line {lines[1]!r} in {path}")
+        if end <= start:
+            raise SystemExit(
+                f"SRT id={cid}: end {format_ts(end)} <= start "
+                f"{format_ts(start)} (srt_duration {end - start:.3f}s). "
+                f"Fix the timestamp in {path}."
+            )
+        text = "\n".join(lines[2:])
+        cues.append({"id": cid, "start": start, "end": end, "text": text})
+    if not cues:
+        raise SystemExit(f"SRT has no cues: {path}")
+    return cues
+
+
+def parse_plan(path: Path) -> list[dict]:
+    """Return a list of {id, source_start, source_end}. Only Form A is
+    accepted here (a flat JSON array); Form B is out of scope for the
+    minimal version.
+
+    Every malformed plan exits via SystemExit rather than a traceback:
+    JSON syntax errors (with line / column), a non-array document, and
+    rows that are not objects or hold missing / mistyped / bad fields.
+    """
+    raw = path.read_text(encoding="utf-8")
+    try:
+        data = json.loads(raw)
+    except json.JSONDecodeError as e:
+        raise SystemExit(
+            f"edit_plan is not valid JSON: {path}: "
+            f"line {e.lineno} col {e.colno}: {e.msg}"
+        )
+    if not isinstance(data, list):
+        raise SystemExit(
+            "edit_plan.json must be a JSON array of "
+            "{id, source_start, source_end} objects (Form A)."
+        )
+    out: list[dict] = []
+    for row in data:
+        try:
+            out.append({
+                "id": int(row["id"]),
+                "source_start": parse_ts(row["source_start"]),
+                "source_end": parse_ts(row["source_end"]),
+            })
+        except (KeyError, TypeError, AttributeError, ValueError) as e:
+            raise SystemExit(f"plan row {row!r}: {type(e).__name__}: {e}")
+    return out
+
+
+# ---------- validation ----------
+
+
+def validate_ids(cues: list[dict], plan: list[dict]) -> None:
+    """Each id must appear exactly once in both sides, and the two id sets
+    must be equal. Any deviation is a hard failure (SystemExit) with a
+    clear message. Duplicates are counted in one pass per side.
+    """
+    from collections import Counter  # local import: top-of-file block untouched
+
+    for label, rows in (("SRT", cues), ("edit_plan", plan)):
+        dups = sorted(i for i, n in Counter(r["id"] for r in rows).items() if n > 1)
+        if dups:
+            raise SystemExit(f"{label} has duplicate ids: {dups}")
+
+    cue_ids = {c["id"] for c in cues}
+    plan_ids = {p["id"] for p in plan}
+    only_srt = cue_ids - plan_ids
+    only_plan = plan_ids - cue_ids
+    if only_srt or only_plan:
+        msg = []
+        if only_srt:
+            msg.append(f"in SRT but missing in plan: {sorted(only_srt)}")
+        if only_plan:
+            msg.append(f"in plan but missing in SRT: {sorted(only_plan)}")
+        raise SystemExit("id mismatch: " + "; ".join(msg))
+
+
+# ---------- report ----------
+
+
+def probe_clip_duration(path: Path) -> float | None:
+    """Ask ffprobe for the container duration of `path`, in seconds.
+
+    Best-effort by design: a missing ffprobe binary, a non-zero exit
+    status, or output that is not a number all yield None instead of
+    raising. Verification is informational, so a failed probe must not
+    abort a run whose extraction already succeeded.
+    """
+    probe_args = [
+        "ffprobe", "-v", "error",
+        "-show_entries", "format=duration",
+        "-of", "default=noprint_wrappers=1:nokey=1",
+        str(path),
+    ]
+    try:
+        result = subprocess.run(
+            probe_args, capture_output=True, text=True,
+            encoding="utf-8", errors="replace",
+        )
+    except FileNotFoundError:
+        return None
+    try:
+        # float() runs only on exit status 0; anything else means "unknown".
+        return None if result.returncode else float(result.stdout.strip())
+    except ValueError:
+        return None
+
+
+def cut_clip(source: Path, start: float, cut_duration: float,
+             out_path: Path) -> None:
+    """Re-encode `cut_duration` seconds of `source`, starting at `start`.
+
+    `-ss` comes before `-i`, so ffmpeg seeks at container level to the
+    nearest keyframe and libx264 re-encodes from there: frame-accurate
+    cuts for the price of one encode pass. `-c copy` would be faster but
+    can only cut on keyframes, which makes the later concat / sync less
+    predictable.
+
+    The first video stream is mandatory (`-map 0:v:0`); audio is mapped
+    with `0:a?`, so a source without audio does not fail.
+
+    Raises SystemExit if ffmpeg is not on PATH or exits non-zero; the
+    latter message embeds the full command line and ffmpeg's stderr.
+    """
+    # Global flags: overwrite existing output, no banner / progress noise.
+    cmd = ["ffmpeg", "-y", "-hide_banner", "-nostats"]
+    # Input, pre-seeked to the cut start.
+    cmd += ["-ss", format_ts_dot(start), "-i", str(source)]
+    # Output length.
+    cmd += ["-t", format_ts_dot(cut_duration)]
+    # Stream selection (audio optional).
+    cmd += ["-map", "0:v:0", "-map", "0:a?"]
+    # Video encode settings.
+    cmd += ["-c:v", "libx264", "-preset", "veryfast", "-crf", "18", "-pix_fmt", "yuv420p"]
+    # Audio encode settings.
+    cmd += ["-c:a", "aac", "-b:a", "192k", "-ar", "48000", "-ac", "2"]
+    # Container flag, then the output path.
+    cmd += ["-movflags", "+faststart", str(out_path)]
+    try:
+        done = subprocess.run(
+            cmd, capture_output=True, text=True,
+            encoding="utf-8", errors="replace",
+        )
+    except FileNotFoundError:
+        raise SystemExit(
+            "ffmpeg not found on PATH. Install ffmpeg "
+            "(`winget install Gyan.FFmpeg` on Windows, "
+            "`brew install ffmpeg` on macOS) and re-run."
+        )
+    if done.returncode == 0:
+        return
+    raise SystemExit(
+        f"ffmpeg failed on {out_path.name} (exit {done.returncode})\n"
+        f"--- command ---\n{' '.join(cmd)}\n"
+        f"--- stderr ---\n{done.stderr or '(empty)'}"
+    )
+
+
+def extract_clips(
+    cues: list[dict],
+    plan: list[dict],
+    source: Path,
+    temp_dir: Path,
+) -> list[Path]:
+    """Cut one clip per cue. Returns the list of output paths in cue-id order.
+
+    Every cue is range-checked BEFORE anything is deleted or encoded, so
+    a bad plan row fails immediately instead of after the earlier clips
+    were encoded, and never wipes the clips of a previous run:
+
+      source_duration = plan.source_end - plan.source_start
+      srt_duration    = cue.end - cue.start
+
+      source_duration <= 0            -> hard error pointing at the id
+      source_duration <  srt_duration -> hard error (source is too short
+                                         to cover the SRT cue)
+      source_duration >= srt_duration -> cut exactly `srt_duration`
+                                         starting at source_start. Any
+                                         extra source tail is discarded.
+
+    Once validation passes, stale `clip_*.mp4` files in `temp_dir` are
+    removed (other files are left alone), each clip is cut to
+    `clip_<id:03d>.mp4` — indexed by SRT id, not position — and finally
+    probed with ffprobe to print actual vs target duration.
+    """
+    plan_by_id = {p["id"]: p for p in plan}
+
+    # Pass 1: validate every range; no filesystem side effects yet.
+    jobs: list[tuple[dict, float, float]] = []  # (cue, source_start, cut_duration)
+    for cue in sorted(cues, key=lambda c: c["id"]):
+        cid = cue["id"]
+        p = plan_by_id[cid]
+        start = p["source_start"]
+        source_duration = p["source_end"] - start
+        srt_duration = cue["end"] - cue["start"]
+        if source_duration <= 0:
+            raise SystemExit(
+                f"plan id={cid}: source_end {format_ts(p['source_end'])} <= "
+                f"source_start {format_ts(start)} "
+                f"(source_duration {source_duration:.3f}s)"
+            )
+        if source_duration < srt_duration - 1e-6:
+            raise SystemExit(
+                f"plan id={cid}: source range is shorter than SRT cue. "
+                f"source_duration={source_duration:.3f}s, "
+                f"srt_duration={srt_duration:.3f}s. "
+                f"Extend the source range or shorten the SRT cue."
+            )
+        # source_duration >= srt_duration: cut exactly srt_duration
+        jobs.append((cue, start, srt_duration))
+
+    # Pass 2: clear stale clips. Only the clip_*.mp4 pattern, so user-
+    # created neighbours (notes, recordings, etc.) are left alone.
+    temp_dir.mkdir(parents=True, exist_ok=True)
+    stale = sorted(temp_dir.glob("clip_*.mp4"))
+    if stale:
+        print(f"clearing {len(stale)} stale clip(s) from {temp_dir}/")
+        for old in stale:
+            old.unlink()
+
+    # Pass 3: cut.
+    print()
+    print(f"cutting {len(cues)} clip(s) -> {temp_dir}/")
+    outputs: list[Path] = []
+    for cue, start, cut_duration in jobs:
+        cid = cue["id"]
+        out_path = temp_dir / f"clip_{cid:03d}.mp4"
+        text_preview = cue["text"].replace("\n", " ").strip()
+        if len(text_preview) > 60:
+            text_preview = text_preview[:57] + "..."
+        print(
+            f"  id={cid:>3}  src@{format_ts(start)}  "
+            f"cut={cut_duration:.3f}s  -> {out_path}\n"
+            f"        text: {text_preview!r}"
+        )
+        cut_clip(source, start, cut_duration, out_path)
+        outputs.append(out_path)
+
+    # Pass 4: ffprobe verification (informational). Container duration
+    # can drift a few hundredths of a second from the target after
+    # re-encoding; printing actual vs target side by side lets the user
+    # spot a clip that is wildly off — e.g. silently truncated to 0s.
+    print()
+    print(f"verifying {len(outputs)} clip(s) with ffprobe:")
+    for out_path, (_, _, target) in zip(outputs, jobs):
+        actual = probe_clip_duration(out_path)
+        if actual is None:
+            print(f"  {out_path.name}: (probe failed)，target: {target:.2f}s")
+        else:
+            print(f"  {out_path.name}: {actual:.2f}s，target: {target:.2f}s")
+
+    return outputs
+
+
+def concat_clips(clip_paths: list[Path], out_path: Path) -> None:
+    """Lossless concat of pre-encoded clips via ffmpeg's concat demuxer.
+
+    The clips produced by `cut_clip` all share the same encoder params
+    (libx264, yuv420p, aac), so `-c copy` is safe — no re-encode. The
+    concat list file is written next to the first clip (typically
+    `temp/_concat.txt`) with single quotes in paths escaped, and is
+    removed in `finally` whether ffmpeg succeeds or fails.
+
+    Raises SystemExit with the full ffmpeg command + stderr on failure.
+    """
+    if not clip_paths:
+        raise SystemExit("concat: no clips to concatenate")
+
+    list_file = clip_paths[0].parent / "_concat.txt"
+    # Each path is wrapped in '...'; a literal ' inside must be written as '\''.
+    entries = [p.resolve().as_posix().replace("'", "'\\''") for p in clip_paths]
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    list_file.write_text(
+        "".join(f"file '{entry}'\n" for entry in entries), encoding="utf-8",
+    )
+    cmd = [
+        "ffmpeg", "-y", "-hide_banner", "-nostats",
+        "-f", "concat", "-safe", "0",
+        "-i", str(list_file),
+        "-c", "copy",
+        "-movflags", "+faststart",
+        str(out_path),
+    ]
+    try:
+        proc = subprocess.run(
+            cmd, capture_output=True, text=True,
+            encoding="utf-8", errors="replace",
+        )
+    except FileNotFoundError:
+        raise SystemExit(
+            "ffmpeg not found on PATH. Install ffmpeg "
+            "(`winget install Gyan.FFmpeg` on Windows, "
+            "`brew install ffmpeg` on macOS) and re-run."
+        )
+    finally:
+        list_file.unlink(missing_ok=True)
+    if proc.returncode != 0:
+        raise SystemExit(
+            f"ffmpeg concat failed (exit {proc.returncode})\n"
+            f"--- command ---\n{' '.join(cmd)}\n"
+            f"--- stderr ---\n{proc.stderr or '(empty)'}"
+        )
+    print(f"  concat {len(clip_paths)} clip(s) -> {out_path}")
+
+
+def print_report(cues: list[dict], plan: list[dict]) -> None:
+    """Print the cue -> source mapping as an aligned table, ordered by cue id."""
+    plan_by_id = {p["id"]: p for p in plan}
+    w = 28  # len("HH:MM:SS,mmm -> HH:MM:SS,mmm"); the former 23 never padded a row
+    print(f"{len(cues)} cue(s), all ids matched.\n")
+    print(f"  {'ID':>3}  {'OUTPUT (cue)':<{w}}  {'SOURCE (planned)':<{w}}  TEXT")
+    print(f"  {'-' * 3}  {'-' * w}  {'-' * w}  {'-' * 4}")
+    for cue in sorted(cues, key=lambda c: c["id"]):
+        p = plan_by_id[cue["id"]]
+        out_range = f"{format_ts(cue['start'])} -> {format_ts(cue['end'])}"
+        src_range = f"{format_ts(p['source_start'])} -> {format_ts(p['source_end'])}"
+        preview = cue["text"].replace("\n", " ")
+        if len(preview) > 50:
+            preview = preview[:47] + "..."
+        print(f"  {cue['id']:>3}  {out_range:<{w}}  {src_range:<{w}}  {preview}")
+
+
+# ---------- entry ----------
+
+
+def main() -> None:
+    """CLI entry: parse flags, check inputs exist, then report, cut and concat."""
+    ap = argparse.ArgumentParser(
+        description=(
+            "MINIMAL learning-grade SRT-driven editor. NOT for production — "
+            "use `python main.py` for that. This script reads script.srt + "
+            "edit_plan.json, validates id matching, prints the planned range "
+            "table, cuts each cue out of source.mp4 into temp/clip_<id>.mp4, "
+            "then lossless-concats them into output/final.mp4. No encoding "
+            "fallback, no range-bounds check, no QC report, no overwrite "
+            "protection."
+        ),
+    )
+    path_flags = {"--srt": "input/script.srt", "--plan": "input/edit_plan.json",
+                  "--source": "input/source.mp4", "--temp-dir": "temp",
+                  "--output": "output/final.mp4"}
+    for flag, default in path_flags.items():
+        ap.add_argument(flag, type=Path, default=Path(default))
+    args = ap.parse_args()
+
+    print(
+        "[srt_video_editor: minimal mode — UTF-8 SRT only, no range/QC "
+        "checks, temp/ + output/ will be overwritten. For production "
+        "use `python main.py`.]"
+    )
+
+    inputs = (args.srt, args.plan, args.source)
+    missing = next((p for p in inputs if not p.is_file()), None)
+    if missing is not None:
+        raise SystemExit(f"file not found: {missing}")
+
+    cues = parse_srt(args.srt)
+    plan = parse_plan(args.plan)
+    validate_ids(cues, plan)
+    print_report(cues, plan)
+    clip_paths = extract_clips(cues, plan, args.source, args.temp_dir)
+    print(f"\nconcatenating -> {args.output}")
+    concat_clips(clip_paths, args.output)
+    print(f"\ndone. final video: {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..7716455
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,153 @@
+"""Shared fixtures for srt_driven_edit pytest suite.
+
+Generates session-scoped synthetic media via ffmpeg's lavfi sources so the
+real extract/concat/burn pipeline can be exercised without bundling binary
+fixtures.
+"""
+
+from __future__ import annotations
+
+import json
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+# Make the skill's helpers/ importable as a flat package (matches the
+# `python helpers/srt_driven_edit.py` invocation contract).
+HELPERS = Path(__file__).resolve().parent.parent / "helpers"
+sys.path.insert(0, str(HELPERS))
+
+
+FFMPEG = shutil.which("ffmpeg")
+FFPROBE = shutil.which("ffprobe")
+
+
+def pytest_collection_modifyitems(config, items):
+    """Mark every collected test as skipped when ffmpeg or ffprobe is missing."""
+    if not (FFMPEG and FFPROBE):
+        skip = pytest.mark.skip(reason="ffmpeg or ffprobe not on PATH")
+        for test_item in items:
+            test_item.add_marker(skip)
+    # Both tools found on PATH: the collection is left untouched.
+
+
+# ---------------------------------------------------------------------------
+# Synthetic media (session-scoped — each costs a few seconds to render)
+# ---------------------------------------------------------------------------
+
+
+def _ffmpeg(*args: str) -> None:
+    """Run `ffmpeg -y` quietly with `args`; RuntimeError (cmd + stderr) on non-zero exit."""
+    cmd = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "error"] + list(args)
+    done = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace")
+    if done.returncode:
+        raise RuntimeError(f"ffmpeg failed:\n  cmd: {' '.join(cmd)}\n  stderr: {done.stderr}")
+
+
+@pytest.fixture(scope="session")
+def synth_av(tmp_path_factory) -> Path:
+    """Session fixture: 30s 1920x1080 @ 24fps testsrc2 video with a 440Hz sine track."""
+    target = tmp_path_factory.mktemp("synth") / "av.mp4"
+    video_in = ["-f", "lavfi", "-i", "testsrc2=size=1920x1080:rate=24:duration=30"]
+    audio_in = ["-f", "lavfi", "-i", "sine=frequency=440:duration=30"]
+    encode = [
+        "-c:v", "libx264", "-preset", "ultrafast", "-pix_fmt", "yuv420p",
+        "-c:a", "aac", "-b:a", "128k", "-ar", "48000",
+        "-shortest",
+    ]
+    # Argument order: both inputs, then encoder settings, then the output path.
+    _ffmpeg(*video_in, *audio_in, *encode, str(target))
+    return target
+
+
+@pytest.fixture(scope="session")
+def synth_v_only(tmp_path_factory) -> Path:
+    """Session fixture: 30s 1920x1080 testsrc2 video rendered with `-an` (no audio)."""
+    target = tmp_path_factory.mktemp("synth_vonly") / "v_only.mp4"
+    flags = [
+        "-f", "lavfi", "-i", "testsrc2=size=1920x1080:rate=24:duration=30",
+        "-an",
+        "-c:v", "libx264", "-preset", "ultrafast", "-pix_fmt", "yuv420p",
+        "-t", "30",
+    ]
+    # Intended for tests of the auto-degrade path (source without audio).
+    _ffmpeg(*flags, str(target))
+    return target
+
+
+@pytest.fixture(scope="session")
+def synth_voice(tmp_path_factory) -> Path:
+    """Session fixture: 5s 880Hz sine WAV (`-ar 48000 -ac 2`) used as a voice clip."""
+    target = tmp_path_factory.mktemp("synth_voice") / "voice.wav"
+    flags = [
+        "-f", "lavfi", "-i", "sine=frequency=880:duration=5",
+        "-ar", "48000", "-ac", "2",
+    ]
+    # Session scope: rendered once, then shared by every test that requests it.
+    _ffmpeg(*flags, str(target))
+    return target
+
+
+# ---------------------------------------------------------------------------
+# Helpers for crafting SRT / plan files inside a test's tmp_path
+# ---------------------------------------------------------------------------
+
+
+def fmt_ts(s: float) -> str:
+    """Format seconds as an SRT timestamp 'HH:MM:SS,mmm'."""
+    whole_minutes, ms = divmod(int(round(s * 1000)), 60_000)
+    h, m = divmod(whole_minutes, 60)
+    sec, ms = divmod(ms, 1000)
+    return f"{h:02d}:{m:02d}:{sec:02d},{ms:03d}"
+
+
+def write_srt(path: Path, cues: list[tuple[int, float, float, str]],
+              encoding: str = "utf-8") -> None:
+    """Write `cues` [(id, start_s, end_s, text)] to `path` as an SRT file.
+
+    The text is encoded with `encoding` and written as bytes, with LF
+    line endings and one blank line between cues.
+    """
+    # Each block ends with its own newline; joining on "\n" adds the blank line.
+    blocks = ["\n".join([str(cid), f"{fmt_ts(s)} --> {fmt_ts(e)}", t, ""]) for cid, s, e, t in cues]
+    path.write_bytes("\n".join(blocks).encode(encoding))
+
+
+def write_plan_form_a(path: Path,
+                      segments: list[tuple[int, float, float]]) -> None:
+    """Write a Form A plan (flat JSON array); segments: [(id, src_start_s, src_end_s)]."""
+    rows = []
+    for cid, start_s, end_s in segments:
+        row = {"id": cid, "source_start": fmt_ts(start_s), "source_end": fmt_ts(end_s)}
+        rows.append(row)
+    path.write_text(json.dumps(rows, indent=2), encoding="utf-8")
+
+
+def write_plan_form_b(path: Path, sources: dict[str, str],
+                      segments: list[dict],
+                      voices: dict[str, str] | None = None) -> None:
+    """Write a Form B plan object: `sources`, `segments` and, if non-empty, `voices`."""
+    payload: dict = dict(sources=sources, segments=segments)
+    payload.update({"voices": voices} if voices else {})
+    text = json.dumps(payload, indent=2, ensure_ascii=False)
+    path.write_text(text, encoding="utf-8")
+
+
+@pytest.fixture
+def helpers_ns():
+    """Return one object exposing `sde` (the module) plus the write_* / fmt_ts helpers."""
+    import srt_driven_edit as sde
+
+    class NS:
+        """Plain attribute bag; carries no behaviour of its own."""
+
+    ns = NS()
+    ns.__dict__.update(
+        sde=sde, fmt_ts=fmt_ts, write_srt=write_srt,
+        write_plan_form_a=write_plan_form_a,
+        write_plan_form_b=write_plan_form_b,
+    )
+    return ns
diff --git a/tests/test_main_entry.py b/tests/test_main_entry.py
new file mode 100644
index 0000000..579cb54
--- /dev/null
+++ b/tests/test_main_entry.py
@@ -0,0 +1,104 @@
+"""Tests for the project-root main.py wrapper.
+
+Only the default-injection logic is unit-tested here; the actual run_job
+path is exercised by tests/test_srt_driven_*.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+
+ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(ROOT))
+
+
+@pytest.fixture
+def main_mod(monkeypatch, tmp_path):
+    """Fresh import of main.py rooted at a tmp dir so ROOT/input doesn't leak."""
+    monkeypatch.chdir(tmp_path)
+    # Reload so module-level state is fresh, then point ROOT at tmp_path so
+    # file-existence checks see what the test wrote, not the real repo root.
+    import importlib
+    import main as _m
+    importlib.reload(_m)
+    monkeypatch.setattr(_m, "ROOT", tmp_path)  # undone at teardown; no stale ROOT
+    return _m
+
+
+def test_defaults_when_no_flags(main_mod, tmp_path):
+    """No flags + nothing in input/ → srt/plan/output defaults, no source/voice."""
+    out = main_mod._inject_defaults([])
+    assert "--srt" in out and "input/script.srt" in out
+    assert "--plan" in out and "input/edit_plan.json" in out
+    assert "-o" in out and "output/final.mp4" in out
+    # input/source.mp4 doesn't exist → --source NOT injected
+    assert "--source" not in out
+    assert "--voice" not in out
+    # output/ dir was created
+    assert (tmp_path / "output").is_dir()
+
+
+def test_injects_source_when_present(main_mod, tmp_path):
+    (tmp_path / "input").mkdir()
+    (tmp_path / "input" / "source.mp4").write_bytes(b"x")
+    out = main_mod._inject_defaults([])
+    assert "--source" in out and "input/source.mp4" in out
+
+
+def test_injects_voice_when_present(main_mod, tmp_path):
+    (tmp_path / "input").mkdir()
+    (tmp_path / "input" / "voice.wav").write_bytes(b"x")
+    out = main_mod._inject_defaults([])
+    assert "--voice" in out and "input/voice.wav" in out
+
+
+def test_user_flags_win(main_mod, tmp_path):
+    (tmp_path / "input").mkdir()
+    (tmp_path / "input" / "source.mp4").write_bytes(b"x")
+    user = ["--srt", "scripts/ep01.srt",
+            "--plan", "plans/ep01.json",
+            "--source", "raw/ep01.mp4",
+            "-o", "out/ep01.mp4"]
+    out = main_mod._inject_defaults(user)
+    # User-supplied wins; no duplicate defaults appended
+    assert out.count("--srt") == 1 and "scripts/ep01.srt" in out
+    assert out.count("--plan") == 1 and "plans/ep01.json" in out
+    assert out.count("--source") == 1 and "raw/ep01.mp4" in out
+    assert out.count("-o") == 1 and "out/ep01.mp4" in out
+    # Default input/script.srt etc. NOT injected
+    assert "input/script.srt" not in out
+    assert "input/edit_plan.json" not in out
+
+
+def test_equals_form_recognized(main_mod, tmp_path):
+    """--flag=value form must count as 'flag is set' so we don't double-inject."""
+    out = main_mod._inject_defaults(["--srt=scripts/x.srt", "--plan=plans/x.json"])
+    # Defaults must NOT be appended. Both the user's tokens and any default
+    # bare `--srt` / `--plan` would otherwise coexist.
+    assert "--srt=scripts/x.srt" in out
+    assert "--plan=plans/x.json" in out
+    assert "--srt" not in out          # no bare default flag
+    assert "--plan" not in out
+    assert "input/script.srt" not in out
+    assert "input/edit_plan.json" not in out
+
+
+def test_batch_mode_skips_all_defaults(main_mod, tmp_path):
+    (tmp_path / "input").mkdir()
+    (tmp_path / "input" / "source.mp4").write_bytes(b"x")
+    out = main_mod._inject_defaults(["--batch", "jobs.json"])
+    # No single-job defaults — manifest owns paths.
+    assert "--srt" not in out
+    assert "--plan" not in out
+    assert "--source" not in out
+    assert "-o" not in out
+    assert "--output" not in out
+
+
+def test_short_output_flag_recognized(main_mod, tmp_path):
+    out = main_mod._inject_defaults(["-o", "custom/path.mp4"])
+    assert out.count("-o") == 1
+    assert "output/final.mp4" not in out
diff --git a/tests/test_recommend_edit_plan.py b/tests/test_recommend_edit_plan.py
new file mode 100644
index 0000000..4453cde
--- /dev/null
+++ b/tests/test_recommend_edit_plan.py
@@ -0,0 +1,493 @@
+"""Tests for recommend_edit_plan."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture
+def rec():
+    """Convenience: import the module under test as a fixture."""
+    import recommend_edit_plan as r
+    return r
+
+
+@pytest.fixture
+def sde():
+    import srt_driven_edit as s
+    return s
+
+
+def write_transcript(path: Path, words: list[dict]) -> None:
+    """Wrap a flat list of {text,start,end,type} dicts in a Scribe-style envelope."""
+    path.write_text(
+        json.dumps({"language_code": "en", "words": words}, ensure_ascii=False),
+        encoding="utf-8",
+    )
+
+
+def write_srt_cues(path, cues, helpers_ns):
+    helpers_ns.write_srt(path, cues, encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# 1. English exact match — high score, correct range
+# ---------------------------------------------------------------------------
+
+
+def test_english_exact_match(rec, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    # Cue duration matches candidate duration exactly so duration warnings stay quiet.
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 1.0, "Hello world"),
+    ])
+    write_transcript(transcript, [
+        {"text": "Hello",  "start": 5.0, "end": 5.4, "type": "word"},
+        {"text": "world.", "start": 5.4, "end": 6.0, "type": "word"},
+        {"text": "Other",  "start": 10.0, "end": 10.5, "type": "word"},
+    ])
+
+    assignments = rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=tmp_path / "plan.json",
+    )
+    assert len(assignments) == 1
+    a = assignments[0]
+    assert a.cand is not None
+    assert abs(a.cand.start - 5.0) < 1e-6
+    assert abs(a.cand.end - 6.0) < 1e-6
+    assert a.score > 0.85, f"exact-text match should score high, got {a.score}"
+    assert not a.warnings, f"unexpected warnings: {a.warnings}"
+
+
+# ---------------------------------------------------------------------------
+# 2. Chinese match — CJK Jaccard path
+# ---------------------------------------------------------------------------
+
+
+def test_chinese_match(rec, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 3.0, "我们这季度把规划器重写了。"),
+    ])
+    write_transcript(transcript, [
+        {"text": "我们",   "start": 12.0, "end": 12.4, "type": "word"},
+        {"text": "这",     "start": 12.4, "end": 12.5, "type": "word"},
+        {"text": "季度",   "start": 12.5, "end": 13.0, "type": "word"},
+        {"text": "把",     "start": 13.0, "end": 13.1, "type": "word"},
+        {"text": "规划器", "start": 13.1, "end": 14.0, "type": "word"},
+        {"text": "重写了。", "start": 14.0, "end": 15.0, "type": "word"},
+        # A distractor far away
+        {"text": "不相关的内容。", "start": 25.0, "end": 26.0, "type": "word"},
+    ])
+
+    assignments = rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=tmp_path / "plan.json",
+    )
+    a = assignments[0]
+    assert a.cand is not None
+    assert abs(a.cand.start - 12.0) < 1e-6
+    assert abs(a.cand.end - 15.0) < 1e-6
+    assert a.score > 0.7
+
+
+# ---------------------------------------------------------------------------
+# 3. Punctuation + case differences still match
+# ---------------------------------------------------------------------------
+
+
+def test_punct_and_case_invariant(rec, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    # SRT: lowercase, no punct, matching duration
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 2.0, "hello there friends"),
+    ])
+    # Transcript: mixed case + phrase punct (commas keep words grouped); the
+    # SENTENCE-end '!' only on the last word so all three stay in one candidate.
+    write_transcript(transcript, [
+        {"text": "HELLO,",   "start": 1.0, "end": 1.5, "type": "word"},
+        {"text": "There,",   "start": 1.5, "end": 2.0, "type": "word"},
+        {"text": "FRIENDS!", "start": 2.0, "end": 3.0, "type": "word"},
+    ])
+    assignments = rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=tmp_path / "plan.json",
+    )
+    a = assignments[0]
+    assert a.cand is not None
+    assert a.score > 0.85, f"normalization should erase case+punct, got {a.score}"
+
+
+# ---------------------------------------------------------------------------
+# 4. Silence gap splits candidates
+# ---------------------------------------------------------------------------
+
+
+def test_silence_gap_splits(rec, tmp_path):
+    """Two phrases separated by a 1.0s silence should produce two candidates,
+    not one — even though neither phrase ends in sentence-end punctuation.
+    """
+    transcript = tmp_path / "transcript.json"
+    write_transcript(transcript, [
+        {"text": "alpha", "start": 1.0, "end": 1.4, "type": "word"},
+        {"text": "beta",  "start": 1.4, "end": 2.0, "type": "word"},
+        # 1.0s silence
+        {"text": "gamma", "start": 3.0, "end": 3.4, "type": "word"},
+        {"text": "delta", "start": 3.4, "end": 4.0, "type": "word"},
+    ])
+    words = rec.load_transcript_words(transcript)
+    candidates = rec.build_candidates(words, gap_threshold=0.5)
+    assert len(candidates) == 2
+    assert abs(candidates[0].start - 1.0) < 1e-6 and abs(candidates[0].end - 2.0) < 1e-6
+    assert abs(candidates[1].start - 3.0) < 1e-6 and abs(candidates[1].end - 4.0) < 1e-6
+    # The 1.0s silence exceeds gap_threshold=0.5, so the two phrases stay split.
+    # Raising the threshold above the silence length must merge them:
+    merged = rec.build_candidates(words, gap_threshold=1.1)
+    assert len(merged) == 1
+
+
+# ---------------------------------------------------------------------------
+# 5. Low-score match emits warning
+# ---------------------------------------------------------------------------
+
+
+def test_low_score_warning(rec, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    # Cue text shares almost no tokens with any candidate
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 2.0, "quantum entanglement decoherence"),
+    ])
+    write_transcript(transcript, [
+        {"text": "apple",  "start": 1.0, "end": 1.5, "type": "word"},
+        {"text": "banana", "start": 1.5, "end": 2.0, "type": "word"},
+    ])
+    assignments = rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=tmp_path / "plan.json",
+        min_score=0.5,  # set high to force the warning
+    )
+    a = assignments[0]
+    assert a.cand is not None  # still got SOME candidate
+    assert any("low score" in w for w in a.warnings)
+
+
+# ---------------------------------------------------------------------------
+# 6. SRT id ordering preserved in output
+# ---------------------------------------------------------------------------
+
+
+def test_ids_preserved(rec, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 1.0, "alpha"),
+        (2, 1.0, 2.0, "beta"),
+        (3, 2.0, 3.0, "gamma"),
+    ])
+    write_transcript(transcript, [
+        {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"},
+        {"text": "beta.",  "start": 5.0, "end": 5.5, "type": "word"},
+        {"text": "gamma.", "start": 10.0, "end": 10.5, "type": "word"},
+    ])
+    out = tmp_path / "plan.json"
+    rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=out,
+    )
+    plan_rows = json.loads(out.read_text(encoding="utf-8"))
+    assert [r["id"] for r in plan_rows] == [1, 2, 3]
+
+
+# ---------------------------------------------------------------------------
+# 7. Output is parseable by srt_driven_edit.parse_plan
+# ---------------------------------------------------------------------------
+
+
+def test_output_is_parseable_by_sde(rec, sde, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 1.0, "alpha"),
+        (2, 1.0, 2.0, "beta"),
+    ])
+    write_transcript(transcript, [
+        {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"},
+        {"text": "beta.",  "start": 5.0, "end": 5.5, "type": "word"},
+    ])
+    out = tmp_path / "plan.json"
+    rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=out,
+    )
+
+    sources, voices, entries = sde.parse_plan(out)
+    assert sources == {} and voices == {}  # Form A — no maps
+    assert [e.id for e in entries] == [1, 2]
+    assert all(e.source_name == "_default" for e in entries)
+    assert entries[0].source_start == 1.0 and entries[0].source_end == 1.5
+    assert entries[1].source_start == 5.0 and entries[1].source_end == 5.5
+
+
+# ---------------------------------------------------------------------------
+# 8. Form B output carries the source name
+# ---------------------------------------------------------------------------
+
+
+def test_form_b_output(rec, sde, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [(1, 0.0, 1.0, "alpha")])
+    write_transcript(transcript, [
+        {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"},
+    ])
+    out = tmp_path / "plan.json"
+    rec.recommend(
+        script_srt=srt, transcript=transcript,
+        source=tmp_path / "src.mp4", source_name="TAKE_A",
+        output_format="form-b", output=out,
+    )
+    data = json.loads(out.read_text(encoding="utf-8"))
+    assert "TAKE_A" in data["sources"]
+    assert data["segments"][0]["source"] == "TAKE_A"
+    # And it's parseable by sde.parse_plan too
+    sources, _, entries = sde.parse_plan(out)
+    assert "TAKE_A" in sources
+    assert entries[0].source_name == "TAKE_A"
+
+
+# ---------------------------------------------------------------------------
+# 9. No candidates → hard fail (per spec)
+# ---------------------------------------------------------------------------
+
+
+def test_no_candidates_aborts(rec, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [(1, 0.0, 1.0, "alpha")])
+    # Transcript has only audio_event (no words)
+    write_transcript(transcript, [
+        {"text": "(laughter)", "start": 1.0, "end": 2.0, "type": "audio_event"},
+    ])
+    with pytest.raises(SystemExit):
+        rec.recommend(
+            script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+            output=tmp_path / "plan.json",
+        )
+
+
+# ---------------------------------------------------------------------------
+# 10. Review markdown shows score + warnings
+# ---------------------------------------------------------------------------
+
+
+def test_review_markdown_content(rec, helpers_ns, tmp_path):
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [(1, 0.0, 2.0, "Hello world")])
+    write_transcript(transcript, [
+        {"text": "Hello",  "start": 1.0, "end": 1.5, "type": "word"},
+        {"text": "world.", "start": 1.5, "end": 2.0, "type": "word"},
+    ])
+    out = tmp_path / "plan.json"
+    rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=out,
+    )
+    review = (out.with_name(out.stem + "_review.md")).read_text(encoding="utf-8")
+    assert "cue id=1" in review
+    assert "Hello world" in review
+    assert "**score**" in review
+    assert "**source range**" in review
+
+
+# ---------------------------------------------------------------------------
+# 11. Source-time ordering: backward-jump warning, monotonic mode, max-gap warning
+# ---------------------------------------------------------------------------
+
+
+def test_backward_source_jump_warns_by_default(rec, helpers_ns, tmp_path):
+    """When a later cue matches an earlier source position, a warning fires.
+
+    Two cues both want a line that appears twice in the source. Greedy
+    matching with no constraint picks the EARLIEST instance for the
+    earlier-ID cue (because Jaccard score breaks ties by first hit), then
+    the SECOND instance for the later cue — so source time advances and
+    no warning. We construct the inverse: make the earlier-ID cue prefer
+    the LATER instance (longer duration → better duration_similarity),
+    leaving only the earlier instance for the later cue, producing a
+    backward jump that must be flagged.
+    """
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    # Cue 1 prefers a 2.0s match; cue 2 prefers a 1.0s match.
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 2.0, "alpha alpha alpha"),
+        (2, 2.0, 3.0, "alpha alpha alpha"),
+    ])
+    write_transcript(transcript, [
+        # Early instance: 1.0s duration → cue 2 will prefer it
+        {"text": "alpha",  "start": 5.0, "end": 5.4, "type": "word"},
+        {"text": "alpha",  "start": 5.4, "end": 5.7, "type": "word"},
+        {"text": "alpha.", "start": 5.7, "end": 6.0, "type": "word"},
+        # Late instance: 2.0s duration → cue 1 will prefer it
+        {"text": "alpha",  "start": 20.0, "end": 20.7, "type": "word"},
+        {"text": "alpha",  "start": 20.7, "end": 21.4, "type": "word"},
+        {"text": "alpha.", "start": 21.4, "end": 22.0, "type": "word"},
+    ])
+
+    assignments = rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=tmp_path / "plan.json",
+    )
+    # Cue 1 picked the 2s late instance, cue 2 picked the 1s early one → backward
+    assert assignments[0].cand.start >= 20.0
+    assert assignments[1].cand.start <= 6.0
+    backward_warnings = [w for w in assignments[1].warnings
+                         if "backward" in w]
+    assert backward_warnings, \
+        f"expected a backward-time warning on cue 2, got: {assignments[1].warnings}"
+
+
+def test_monotonic_source_prevents_backward_jump(rec, helpers_ns, tmp_path):
+    """With --monotonic-source the same setup must NOT pick the early
+    instance for cue 2. The constraint forces cue 2's candidate to start
+    at or after cue 1's end."""
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 2.0, "alpha alpha alpha"),
+        (2, 2.0, 3.0, "alpha alpha alpha"),
+    ])
+    # Same as the previous test PLUS a third late instance so cue 2 has a
+    # forward option under the constraint.
+    write_transcript(transcript, [
+        {"text": "alpha",  "start": 5.0, "end": 5.4, "type": "word"},
+        {"text": "alpha",  "start": 5.4, "end": 5.7, "type": "word"},
+        {"text": "alpha.", "start": 5.7, "end": 6.0, "type": "word"},
+        {"text": "alpha",  "start": 20.0, "end": 20.7, "type": "word"},
+        {"text": "alpha",  "start": 20.7, "end": 21.4, "type": "word"},
+        {"text": "alpha.", "start": 21.4, "end": 22.0, "type": "word"},
+        {"text": "alpha",  "start": 30.0, "end": 30.4, "type": "word"},
+        {"text": "alpha",  "start": 30.4, "end": 30.7, "type": "word"},
+        {"text": "alpha.", "start": 30.7, "end": 31.0, "type": "word"},
+    ])
+
+    assignments = rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=tmp_path / "plan.json",
+        monotonic_source=True,
+    )
+    assert assignments[0].cand.start >= 20.0
+    assert assignments[1].cand.start >= assignments[0].cand.end - 1e-6, \
+        "cue 2's candidate must start at or after cue 1's end under monotonic"
+    # No backward warning under monotonic mode
+    assert not any("backward" in w for w in assignments[1].warnings)
+
+
+def test_max_source_gap_warning(rec, helpers_ns, tmp_path):
+    """--max-source-gap fires a warning when the gap exceeds the threshold."""
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 1.0, "alpha"),
+        (2, 1.0, 2.0, "beta"),
+    ])
+    write_transcript(transcript, [
+        {"text": "alpha.", "start": 1.0, "end": 1.5, "type": "word"},
+        # Big gap to next: beta is at 60+ seconds away
+        {"text": "beta.",  "start": 65.0, "end": 65.5, "type": "word"},
+    ])
+    assignments = rec.recommend(
+        script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+        output=tmp_path / "plan.json",
+        max_source_gap_warn=10.0,  # gap is ~63.5s, well over 10s
+    )
+    jump_warnings = [w for w in assignments[1].warnings
+                     if "source-time jump" in w]
+    assert jump_warnings, \
+        f"expected a big-gap warning, got: {assignments[1].warnings}"
+
+
+def test_monotonic_with_no_forward_candidate_fails(rec, helpers_ns, tmp_path):
+    """If no candidate can satisfy the monotonic constraint, the cue gets
+    the 'no candidate available at or after ...' warning and write_plan
+    hard-fails (per the no-candidate contract)."""
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 1.0, "alpha"),
+        (2, 1.0, 2.0, "beta"),
+    ])
+    write_transcript(transcript, [
+        # alpha matches at 20s, taking cue 1
+        {"text": "alpha.", "start": 20.0, "end": 21.0, "type": "word"},
+        # beta only available BEFORE alpha — monotonic can't reach it
+        {"text": "beta.",  "start": 5.0,  "end": 6.0,  "type": "word"},
+    ])
+    with pytest.raises(SystemExit):
+        rec.recommend(
+            script_srt=srt, transcript=transcript, source=Path("fake.mp4"),
+            output=tmp_path / "plan.json",
+            monotonic_source=True,
+        )
+
+
+def test_e2e_recommend_then_render(
+    rec, sde, helpers_ns, synth_av, tmp_path
+):
+    """Full chain: fabricated transcript → recommend → run_job → final.mp4."""
+    srt = tmp_path / "script.srt"
+    transcript = tmp_path / "transcript.json"
+    plan = tmp_path / "plan.json"
+    out_mp4 = tmp_path / "final.mp4"
+
+    # 3 cues totaling 6s of output
+    helpers_ns.write_srt(srt, [
+        (1, 0.0, 2.0, "alpha beta"),
+        (2, 2.0, 4.0, "gamma delta"),
+        (3, 4.0, 6.0, "epsilon zeta"),
+    ])
+    # Transcript: words that match each cue at distinct, valid times in synth_av (30s)
+    # Each candidate is exactly 2s — matches cue duration exactly so no on-short needed.
+    write_transcript(transcript, [
+        {"text": "alpha",   "start": 1.0, "end": 1.8, "type": "word"},
+        {"text": "beta.",   "start": 1.8, "end": 3.0, "type": "word"},
+        # silence gap
+        {"text": "gamma",   "start": 8.0, "end": 8.8, "type": "word"},
+        {"text": "delta.",  "start": 8.8, "end": 10.0, "type": "word"},
+        # silence gap
+        {"text": "epsilon", "start": 18.0, "end": 18.8, "type": "word"},
+        {"text": "zeta.",   "start": 18.8, "end": 20.0, "type": "word"},
+    ])
+
+    assignments = rec.recommend(
+        script_srt=srt, transcript=transcript, source=synth_av,
+        output=plan,
+    )
+    assert len(assignments) == 3
+    assert all(a.cand is not None for a in assignments)
+
+    # Render via the existing pipeline
+    ffmpeg_version = sde.preflight()["ffmpeg"]
+    job = sde.Job(
+        source=synth_av,
+        srt=srt, plan=plan,
+        voice=None, bg_volume=0.0,
+        tolerance=0.5, trim_direction="tail", on_short="error",
+        style="auto", fontsdir=None,
+        output=out_mp4,
+        name="e2e",
+        no_cache=False, keep_intermediates=False, no_overwrite=False,
+    )
+    qc = sde.run_job(job, ffmpeg_version)
+    assert qc["ok"] is True
+    assert out_mp4.exists()
+    assert abs(qc["duration"]["drift_ms"]) <= 200
diff --git a/tests/test_run_episodes.py b/tests/test_run_episodes.py
new file mode 100644
index 0000000..612b523
--- /dev/null
+++ b/tests/test_run_episodes.py
@@ -0,0 +1,251 @@
+"""Tests for the multi-episode batch runner."""
+
+from __future__ import annotations
+
+import shutil
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture
+def runner():
+    import run_episodes
+    return run_episodes
+
+
+@pytest.fixture
+def ffmpeg_version(helpers_ns):
+    return helpers_ns.sde.preflight()["ffmpeg"]
+
+
+def _make_ep(ep_dir: Path, source: Path, helpers_ns, *,
+             cues=None, plan=None, voice: Path | None = None) -> None:
+    ep_dir.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(source, ep_dir / "source.mp4")
+    helpers_ns.write_srt(ep_dir / "script.srt", cues or [
+        (1, 0.0, 1.5, "alpha"),
+        (2, 1.5, 3.0, "beta"),
+    ])
+    helpers_ns.write_plan_form_a(ep_dir / "edit_plan.json", plan or [
+        (1, 1.0, 2.5),
+        (2, 5.0, 6.5),
+    ])
+    if voice is not None:
+        shutil.copy2(voice, ep_dir / "voice.wav")
+
+
+# ---------------------------------------------------------------------------
+# 1. Discovery: pick up complete dirs, skip incomplete ones
+# ---------------------------------------------------------------------------
+
+
+def test_discover_skips_incomplete_dirs(runner, helpers_ns, synth_av, tmp_path):
+    batch = tmp_path / "batch"
+    _make_ep(batch / "ep01", synth_av, helpers_ns)
+    _make_ep(batch / "ep02", synth_av, helpers_ns)
+    # incomplete: missing edit_plan.json
+    bad = batch / "ep03"
+    bad.mkdir(parents=True)
+    shutil.copy2(synth_av, bad / "source.mp4")
+    helpers_ns.write_srt(bad / "script.srt", [(1, 0.0, 1.5, "x")])
+    # not a dir at all
+    (batch / "stray.txt").write_text("ignore me", encoding="utf-8")
+
+    eps = runner.discover_episodes(batch)
+    names = [e.name for e in eps]
+    assert names == ["ep01", "ep02"]
+
+
+def test_discover_sees_voice_wav_if_present(
+    runner, helpers_ns, synth_av, synth_voice, tmp_path
+):
+    batch = tmp_path / "batch"
+    _make_ep(batch / "ep01", synth_av, helpers_ns)
+    _make_ep(batch / "ep02", synth_av, helpers_ns, voice=synth_voice)
+
+    eps = runner.discover_episodes(batch)
+    by_name = {e.name: e for e in eps}
+    assert by_name["ep01"].voice is None
+    assert by_name["ep02"].voice is not None and by_name["ep02"].voice.is_file()
+
+
+def test_discover_hard_fails_on_empty_root(runner, tmp_path):
+    batch = tmp_path / "empty"
+    batch.mkdir()
+    with pytest.raises(SystemExit) as exc:
+        runner.discover_episodes(batch)
+    assert "no usable" in str(exc.value)
+
+
+# ---------------------------------------------------------------------------
+# 2. End-to-end: 3 eps run sequentially, each produces final.mp4
+# ---------------------------------------------------------------------------
+
+
+def test_run_episodes_e2e(runner, helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    batch = tmp_path / "batch"
+    for name in ("ep01", "ep02", "ep03"):
+        _make_ep(batch / name, synth_av, helpers_ns)
+
+    summary = runner.run_episodes(batch, ffmpeg_version=ffmpeg_version)
+
+    assert summary["episodes_total"] == 3
+    assert summary["ok"] == 3
+    for name in ("ep01", "ep02", "ep03"):
+        final = batch / name / "final.mp4"
+        assert final.exists(), f"{name}/final.mp4 missing"
+
+    # Summary artifact
+    summary_file = batch / "run_episodes_summary.json"
+    assert summary_file.exists()
+
+
+# ---------------------------------------------------------------------------
+# 3. continue-on-error skips a broken ep, finishes the rest
+# ---------------------------------------------------------------------------
+
+
+def test_run_episodes_continue_on_error(
+    runner, helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    batch = tmp_path / "batch"
+    _make_ep(batch / "ep01", synth_av, helpers_ns)
+    # ep02: range exceeds the synth source (30s) — pre-extract range check fires
+    _make_ep(batch / "ep02", synth_av, helpers_ns,
+             plan=[(1, 1.0, 2.5), (2, 60.0, 61.5)])
+    _make_ep(batch / "ep03", synth_av, helpers_ns)
+
+    summary = runner.run_episodes(
+        batch, ffmpeg_version=ffmpeg_version,
+        continue_on_error=True,
+    )
+    assert summary["episodes_total"] == 3
+    assert summary["ok"] == 2
+    # ep01 + ep03 produced output, ep02 did not
+    assert (batch / "ep01" / "final.mp4").exists()
+    assert not (batch / "ep02" / "final.mp4").exists()
+    assert (batch / "ep03" / "final.mp4").exists()
+
+
+def test_run_episodes_aborts_without_continue_on_error(
+    runner, helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    batch = tmp_path / "batch"
+    _make_ep(batch / "ep01", synth_av, helpers_ns)
+    _make_ep(batch / "ep02", synth_av, helpers_ns,
+             plan=[(1, 1.0, 2.5), (2, 60.0, 61.5)])  # bad
+    _make_ep(batch / "ep03", synth_av, helpers_ns)
+
+    with pytest.raises(SystemExit):
+        runner.run_episodes(batch, ffmpeg_version=ffmpeg_version)
+    # ep03 was never reached
+    assert not (batch / "ep03" / "final.mp4").exists()
+
+
+# ---------------------------------------------------------------------------
+# 4. Failure records, corrupt plans, extract mode, and per-ep voice.wav
+# ---------------------------------------------------------------------------
+
+
+def test_run_episodes_failure_record_includes_paths(
+    runner, helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    """When --continue-on-error skips an ep, the record must carry enough
+    context to triage without re-reading the terminal."""
+    batch = tmp_path / "batch"
+    _make_ep(batch / "ep01", synth_av, helpers_ns)
+    _make_ep(batch / "ep02", synth_av, helpers_ns,
+             plan=[(1, 1.0, 2.5), (2, 60.0, 61.5)])  # range overruns 30s synth
+    _make_ep(batch / "ep03", synth_av, helpers_ns)
+
+    summary = runner.run_episodes(
+        batch, ffmpeg_version=ffmpeg_version,
+        continue_on_error=True,
+    )
+    failed = [r for r in summary["results"] if not r.get("ok")]
+    assert len(failed) == 1
+    rec = failed[0]
+    assert rec["job"] == "ep02"
+    assert rec["index"] == 1
+    assert rec["srt"].endswith("script.srt")
+    assert rec["plan"].endswith("edit_plan.json")
+    assert rec["source"].endswith("source.mp4")
+    assert rec["output"].endswith("final.mp4")
+    assert rec["error"]
+    # Pre-extract range-bounds check → no ffmpeg → empty stderr
+    assert rec["stderr_tail"] == ""
+
+
+def test_run_episodes_continues_past_corrupt_plan_json(
+    runner, helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    """A non-SystemExit (JSONDecodeError) inside run_job must NOT abort
+    --continue-on-error. Pre-fix the loop only caught SystemExit, so a
+    malformed edit_plan.json in one ep would crash the whole batch."""
+    batch = tmp_path / "batch"
+    _make_ep(batch / "ep01", synth_av, helpers_ns)
+    # ep02: valid SRT + source, but plan.json is garbage
+    ep02 = batch / "ep02"
+    ep02.mkdir()
+    # Built by hand (not _make_ep) so the plan file can be deliberately invalid.
+    shutil.copy2(synth_av, ep02 / "source.mp4")
+    helpers_ns.write_srt(ep02 / "script.srt", [(1, 0.0, 1.5, "x")])
+    (ep02 / "edit_plan.json").write_text("{ not json", encoding="utf-8")
+    # ep03 should still run
+    _make_ep(batch / "ep03", synth_av, helpers_ns)
+
+    summary = runner.run_episodes(
+        batch, ffmpeg_version=ffmpeg_version,
+        continue_on_error=True,
+    )
+    assert summary["episodes_total"] == 3
+    assert summary["ok"] == 2
+
+    failed = [r for r in summary["results"] if not r.get("ok")]
+    assert len(failed) == 1 and failed[0]["job"] == "ep02"
+    assert "JSON" in failed[0]["error"] or "json" in failed[0]["error"]
+    # ep03 (the post-bad one) must have run
+    assert (batch / "ep03" / "final.mp4").exists()
+
+
+def test_run_episodes_extract_mode(
+    runner, helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    """--mode extract across multiple eps: each ep produces clip_*.mp4 in its
+    own edit/extracted_clips_<ep>/ and NOT a final.mp4."""
+    batch = tmp_path / "batch"
+    for name in ("ep01", "ep02"):
+        _make_ep(batch / name, synth_av, helpers_ns)
+
+    summary = runner.run_episodes(
+        batch, ffmpeg_version=ffmpeg_version, mode="extract",
+    )
+
+    assert summary["episodes_total"] == 2
+    assert summary["ok"] == 2
+    for r in summary["results"]:
+        assert r["mode"] == "extract"
+        assert r["clip_count"] == 2  # _make_ep's default script has 2 cues
+        extracted_dir = Path(r["extracted_dir"])
+        assert extracted_dir.is_dir()
+        assert (extracted_dir / "clip_001.mp4").exists()
+        assert (extracted_dir / "clip_002.mp4").exists()
+    # No final.mp4 in any ep dir
+    for name in ("ep01", "ep02"):
+        assert not (batch / name / "final.mp4").exists()
+
+
+def test_run_episodes_per_ep_voice(
+    runner, helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path
+):
+    batch = tmp_path / "batch"
+    _make_ep(batch / "ep01", synth_av, helpers_ns)
+    _make_ep(batch / "ep02", synth_av, helpers_ns, voice=synth_voice)
+
+    summary = runner.run_episodes(batch, ffmpeg_version=ffmpeg_version)
+
+    by_name = {r["job"]: r for r in summary["results"]}
+    assert by_name["ep01"]["audio"]["voice_used"] is False
+    assert by_name["ep02"]["audio"]["voice_used"] is True
+    assert by_name["ep02"]["audio"]["mode"] == "voice_replace"
diff --git a/tests/test_srt_driven_batch.py b/tests/test_srt_driven_batch.py
new file mode 100644
index 0000000..37733d5
--- /dev/null
+++ b/tests/test_srt_driven_batch.py
@@ -0,0 +1,325 @@
+"""Batch-manifest tests for srt_driven_edit.
+
+Exercises load_manifest + job_from_dict + run_job in the loop pattern that
+the CLI uses, without depending on argv parsing.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Common cue/plan helpers used across batch jobs
+# ---------------------------------------------------------------------------
+
+CUES_2 = [  # rows for write_srt; presumably (id, start_s, end_s, text) — confirm
+    (1, 0.0, 2.0, "alpha"),
+    (2, 2.0, 4.0, "beta"),
+]
+
+PLAN_2 = [  # rows for write_plan_form_a; presumably (id, src_start_s, src_end_s)
+    (1, 1.0, 3.0),
+    (2, 5.0, 7.0),
+]
+
+
+def default_args_namespace() -> argparse.Namespace:
+    """Assemble the Namespace of default option values job_from_dict expects."""
+    option_defaults = {
+        "bg_volume": 0.0, "tolerance": 0.5,
+        "trim_direction": "tail",
+        "on_short": "error",
+        "style": "auto",
+        "no_cache": False,
+        "keep_intermediates": False,
+        "no_overwrite": False,
+    }
+    return argparse.Namespace(**option_defaults)
+
+
+def run_batch(helpers_ns, manifest_path, ffmpeg_version, *,
+              continue_on_error: bool = False) -> list[dict]:
+    """Mirror the CLI's batch loop so we can unit-test it."""
+    sde = helpers_ns.sde
+    defaults = default_args_namespace()
+    rows = sde.load_manifest(manifest_path)
+    results: list[dict] = []
+    for i, row in enumerate(rows):
+        try:  # stage 1: build the Job from the manifest row
+            job = sde.job_from_dict(row, defaults, manifest_path.parent, i)
+        except (SystemExit, Exception) as e:  # SystemExit is not an Exception subclass
+            if continue_on_error:
+                results.append(sde.make_failure_record(
+                    index=i, name=row.get("name", f"row{i}"),
+                    error=e, job=None, manifest_row=row,  # no Job yet; pass the raw row
+                ))
+                continue
+            raise
+        try:  # stage 2: run the Job that was just built
+            results.append(sde.run_job(job, ffmpeg_version))
+        except (SystemExit, Exception) as e:
+            if continue_on_error:
+                results.append(sde.make_failure_record(
+                    index=i, name=job.name, error=e, job=job,
+                ))
+                continue
+            raise  # continue_on_error is off: the first failure reaches the caller
+    return results
+
+
+@pytest.fixture
+def ffmpeg_version(helpers_ns) -> str:
+    return helpers_ns.sde.preflight()["ffmpeg"]  # "ffmpeg" entry of the preflight() result
+
+
+# ---------------------------------------------------------------------------
+# 1. Two jobs same name, no output specified → auto-isolated outputs
+# ---------------------------------------------------------------------------
+
+
+def test_batch_auto_isolation(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    """Two rows sharing a name, with no output given, must get distinct outputs."""
+    for i in range(2):  # separate files per row, same CUES_2 / PLAN_2 content
+        srt = tmp_path / f"script_{i}.srt"
+        plan = tmp_path / f"plan_{i}.json"
+        helpers_ns.write_srt(srt, CUES_2)
+        helpers_ns.write_plan_form_a(plan, PLAN_2)
+
+    manifest_path = tmp_path / "jobs.json"
+    manifest_path.write_text(json.dumps([
+        {"name": "promo",  # same name on purpose
+         "source": str(synth_av),
+         "srt": "script_0.srt",
+         "plan": "plan_0.json"},
+        {"name": "promo",  # collision
+         "source": str(synth_av),
+         "srt": "script_1.srt",
+         "plan": "plan_1.json"},
+    ]), encoding="utf-8")
+
+    results = run_batch(helpers_ns, manifest_path, ffmpeg_version)
+    assert len(results) == 2
+    assert all(r["ok"] for r in results)
+
+    out_paths = [Path(r["output_path"]) for r in results]
+    # auto-isolated → distinct
+    assert out_paths[0] != out_paths[1]
+    # File names carry the row index as a suffix ("_00", "_01")
+    assert "_00" in out_paths[0].name
+    assert "_01" in out_paths[1].name
+    for p in out_paths:
+        assert p.exists()
+
+
+# ---------------------------------------------------------------------------
+# 2. continue-on-error skips a malformed row, finishes the rest
+# ---------------------------------------------------------------------------
+
+
+def test_batch_continue_on_error(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    """Three rows (ok, broken, ok): the broken one is recorded, the others run."""
+    for i in (0, 2):  # input files are written only for rows 0 and 2
+        helpers_ns.write_srt(tmp_path / f"s{i}.srt", CUES_2)
+        helpers_ns.write_plan_form_a(tmp_path / f"p{i}.json", PLAN_2)
+
+    manifest_path = tmp_path / "jobs.json"
+    manifest_path.write_text(json.dumps([
+        {"name": "ok0", "source": str(synth_av),
+         "srt": "s0.srt", "plan": "p0.json"},
+        {"name": "broken", "source": str(synth_av),
+         "srt": "s_missing.srt"},  # no plan, srt also missing
+        {"name": "ok2", "source": str(synth_av),
+         "srt": "s2.srt", "plan": "p2.json"},
+    ]), encoding="utf-8")
+
+    results = run_batch(helpers_ns, manifest_path, ffmpeg_version,
+                        continue_on_error=True)
+    assert len(results) == 3
+    assert results[0]["ok"] is True
+    assert results[1]["ok"] is False and "error" in results[1]
+    assert results[2]["ok"] is True
+
+
+def test_batch_aborts_without_continue_on_error(
+    helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    helpers_ns.write_srt(tmp_path / "s0.srt", CUES_2)
+    helpers_ns.write_plan_form_a(tmp_path / "p0.json", PLAN_2)
+    # Manifest: one runnable row, then a row with no plan entry.
+    manifest_path = tmp_path / "jobs.json"
+    manifest_path.write_text(json.dumps([
+        {"name": "ok0", "source": str(synth_av),
+         "srt": "s0.srt", "plan": "p0.json"},
+        {"name": "broken", "source": str(synth_av),
+         "srt": "s_missing.srt"},  # no plan
+    ]), encoding="utf-8")
+    # With continue_on_error off, run_batch re-raises instead of recording.
+    with pytest.raises(SystemExit):
+        run_batch(helpers_ns, manifest_path, ffmpeg_version,
+                  continue_on_error=False)
+
+
+# ---------------------------------------------------------------------------
+# 3. CSV manifest is supported
+# ---------------------------------------------------------------------------
+
+
+def test_batch_csv_manifest(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    helpers_ns.write_srt(tmp_path / "s.srt", CUES_2)
+    helpers_ns.write_plan_form_a(tmp_path / "p.json", PLAN_2)
+    # One job written as a CSV header line plus a single data row.
+    manifest = tmp_path / "jobs.csv"
+    manifest.write_text(
+        "name,source,srt,plan,bg_volume\n"
+        f"promo,{synth_av},s.srt,p.json,0.0\n",  # NOTE(review): unquoted; assumes no comma in the path
+        encoding="utf-8",
+    )
+    results = run_batch(helpers_ns, manifest, ffmpeg_version)
+    assert len(results) == 1 and results[0]["ok"] is True
+
+
+# ---------------------------------------------------------------------------
+# 4. Failure records, ffmpeg stderr capture, and per-job bg_volume (cache must NOT collide)
+# ---------------------------------------------------------------------------
+
+
+def test_run_ff_raises_pipeline_error_with_stderr(helpers_ns, tmp_path):
+    """run_ff must raise PipelineError carrying a non-empty stderr tail."""
+    sde = helpers_ns.sde
+    out = tmp_path / "out.mp4"
+    bogus = tmp_path / "definitely_missing.mp4"  # never created in this test
+    with pytest.raises(sde.PipelineError) as exc:
+        sde.run_ff(
+            ["ffmpeg", "-y", "-hide_banner", "-i", str(bogus), str(out)],
+            "intentional failure",
+        )
+    # Subclass of SystemExit → existing handlers keep working
+    assert isinstance(exc.value, SystemExit)
+    assert exc.value.stderr_tail, "stderr_tail should be populated on ffmpeg failure"
+    # Either the missing file's name or ffmpeg's "No such file" text is accepted
+    assert "definitely_missing.mp4" in exc.value.stderr_tail \
+        or "No such file" in exc.value.stderr_tail
+
+
+def test_batch_failure_record_includes_paths(
+    helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    """A failed batch row must carry index/srt/plan/source/output for triage."""
+    helpers_ns.write_srt(tmp_path / "s_ok.srt", CUES_2)
+    helpers_ns.write_plan_form_a(tmp_path / "p_ok.json", PLAN_2)
+    helpers_ns.write_srt(tmp_path / "s_bad.srt", CUES_2)
+    # out-of-bounds range (synth_av is 30s; 60s exceeds it) — fails in pre-flight,
+    # no ffmpeg invocation → stderr_tail should stay empty.
+    helpers_ns.write_plan_form_a(tmp_path / "p_bad.json",
+                                  [(1, 1.0, 3.0), (2, 60.0, 62.0)])
+
+    manifest_path = tmp_path / "jobs.json"
+    manifest_path.write_text(json.dumps([
+        {"name": "ok",  "source": str(synth_av),
+         "srt": "s_ok.srt",  "plan": "p_ok.json"},
+        {"name": "bad", "source": str(synth_av),
+         "srt": "s_bad.srt", "plan": "p_bad.json"},
+    ]), encoding="utf-8")
+
+    results = run_batch(helpers_ns, manifest_path, ffmpeg_version,
+                        continue_on_error=True)
+    assert len(results) == 2 and results[0]["ok"] is True
+    failed = results[1]  # the record for the "bad" row
+    assert failed["ok"] is False
+    assert failed["job"] == "bad"
+    assert failed["index"] == 1
+    assert failed["srt"] and failed["srt"].endswith("s_bad.srt")
+    assert failed["plan"] and failed["plan"].endswith("p_bad.json")
+    assert failed["source"] == str(synth_av)  # same string the manifest row carried
+    assert failed["output"] and failed["output"].endswith(".mp4")
+    assert failed["error"]
+    # Range-bounds check fires before any ffmpeg → no stderr
+    assert failed["stderr_tail"] == ""
+
+
+def test_batch_malformed_row_failure_record(helpers_ns, ffmpeg_version, tmp_path):
+    """A row that fails inside job_from_dict still gets a usable record.
+
+    No Job was ever constructed, so paths come from the raw manifest row.
+    """
+    manifest_path = tmp_path / "jobs.json"
+    manifest_path.write_text(json.dumps([
+        {"name": "broken",
+         "source": "raw/take.mp4",
+         "srt":    "scripts/missing.srt"},  # no `plan` field
+    ]), encoding="utf-8")
+
+    results = run_batch(helpers_ns, manifest_path, ffmpeg_version,
+                        continue_on_error=True)
+    assert len(results) == 1
+    failed = results[0]
+    assert failed["ok"] is False
+    assert failed["job"] == "broken"
+    assert failed["index"] == 0
+    # Source / SRT come from the row dict because Job construction never completed
+    assert failed["source"] == "raw/take.mp4"  # verbatim, still relative
+    assert failed["srt"] == "scripts/missing.srt"
+    assert failed["plan"] is None  # the row had no plan entry at all
+    assert failed["stderr_tail"] == ""
+
+
+def test_batch_continues_past_corrupt_plan_json(
+    helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    """A row whose plan.json is malformed must NOT abort the batch under
+    --continue-on-error. JSONDecodeError used to escape the loop because
+    we only caught SystemExit; the failure record now captures it.
+    """
+    # Good row
+    helpers_ns.write_srt(tmp_path / "s_ok.srt", CUES_2)
+    helpers_ns.write_plan_form_a(tmp_path / "p_ok.json", PLAN_2)
+    # Bad plan: not valid JSON
+    helpers_ns.write_srt(tmp_path / "s_bad.srt", CUES_2)
+    (tmp_path / "p_bad.json").write_text("{ this is not json", encoding="utf-8")
+    # Another good row after the bad one — must still run
+    helpers_ns.write_srt(tmp_path / "s_ok2.srt", CUES_2)
+    helpers_ns.write_plan_form_a(tmp_path / "p_ok2.json", PLAN_2)
+
+    manifest_path = tmp_path / "jobs.json"
+    manifest_path.write_text(json.dumps([
+        {"name": "ok0",    "source": str(synth_av),
+         "srt": "s_ok.srt",   "plan": "p_ok.json"},
+        {"name": "broken", "source": str(synth_av),
+         "srt": "s_bad.srt",  "plan": "p_bad.json"},
+        {"name": "ok2",    "source": str(synth_av),
+         "srt": "s_ok2.srt",  "plan": "p_ok2.json"},
+    ]), encoding="utf-8")
+
+    results = run_batch(helpers_ns, manifest_path, ffmpeg_version,
+                        continue_on_error=True)
+    assert len(results) == 3
+    assert results[0]["ok"] is True
+    assert results[1]["ok"] is False
+    assert "JSON" in results[1]["error"] or "json" in results[1]["error"]  # either letter case
+    assert results[1]["plan"] and results[1]["plan"].endswith("p_bad.json")
+    assert results[2]["ok"] is True  # the row after the corrupt one still ran
+
+
+def test_batch_per_job_bg_volume(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    helpers_ns.write_srt(tmp_path / "s.srt", CUES_2)
+    helpers_ns.write_plan_form_a(tmp_path / "p.json", PLAN_2)
+    # Both rows share source, SRT and plan; only bg_volume differs (0.0 vs 0.1).
+    manifest = tmp_path / "jobs.json"
+    manifest.write_text(json.dumps([
+        {"name": "silent", "source": str(synth_av),
+         "srt": "s.srt", "plan": "p.json", "bg_volume": 0.0},
+        {"name": "bg10", "source": str(synth_av),
+         "srt": "s.srt", "plan": "p.json", "bg_volume": 0.1},
+    ]), encoding="utf-8")
+
+    results = run_batch(helpers_ns, manifest, ffmpeg_version)
+    assert len(results) == 2 and all(r["ok"] for r in results)
+    assert results[0]["audio"]["mode"] == "silent"  # bg_volume 0.0, no voice
+    assert results[1]["audio"]["mode"] == "original_only"  # bg_volume 0.1, no voice
+    # bg10 should NOT have hit cache from silent (different effective_bg → different key)
+    assert all(s["cached"] is False for s in results[1]["segments"])
diff --git a/tests/test_srt_driven_e2e.py b/tests/test_srt_driven_e2e.py
new file mode 100644
index 0000000..ea5f05e
--- /dev/null
+++ b/tests/test_srt_driven_e2e.py
@@ -0,0 +1,441 @@
+"""End-to-end tests for srt_driven_edit.
+
+Each test crafts an SRT + plan file inside tmp_path, runs run_job against
+the session-scoped synthetic source video, and verifies output existence,
+duration accuracy (within 200ms), and QC report contents.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Test helpers
+# ---------------------------------------------------------------------------
+
+DEFAULT_CUES = [  # rows for write_srt; presumably (id, start_s, end_s, text)
+    (1, 0.0, 2.0, "first cue"),
+    (2, 2.0, 4.5, "second cue"),
+    (3, 6.0, 8.5, "third cue with leading gap"),  # 1.5s gap before this
+]
+
+DEFAULT_PLAN = [  # rows for write_plan_form_a; one entry per cue id above
+    (1, 1.0, 3.0),   # 2.0s from source[1.0-3.0]
+    (2, 5.0, 7.5),   # 2.5s from source[5.0-7.5]
+    (3, 10.0, 12.5),  # 2.5s from source[10.0-12.5]
+]
+
+
+def make_job(helpers_ns, srt_path, plan_path, tmp_path, *,
+             source=None, voice=None, bg_volume=0.0,
+             style="auto", no_overwrite=False, output=None,
+             mode="full"):
+    """Build an sde.Job with this suite's fixed tolerance/trim/cache settings.
+
+    ``output`` falls back to ``tmp_path / "out.mp4"``; ``name`` is the SRT stem.
+    """
+    destination = output or (tmp_path / "out.mp4")
+    return helpers_ns.sde.Job(
+        source=source, srt=srt_path, plan=plan_path, voice=voice,
+        bg_volume=bg_volume,
+        tolerance=0.5,
+        trim_direction="tail",
+        on_short="error",
+        style=style,
+        fontsdir=None,
+        output=destination,
+        name=srt_path.stem,
+        no_cache=False, keep_intermediates=False,
+        no_overwrite=no_overwrite,
+        mode=mode,
+    )
+
+
+@pytest.fixture
+def ffmpeg_version(helpers_ns) -> str:
+    return helpers_ns.sde.preflight()["ffmpeg"]  # "ffmpeg" entry of the preflight() result
+
+
+# ---------------------------------------------------------------------------
+# 1. Basic e2e: source.mp4 + 3 cues → final has expected duration
+# ---------------------------------------------------------------------------
+
+
+def test_basic_single_job(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    """Three-cue job on the synthetic source: 8.5s expected, silent audio."""
+    srt_file, plan_file = tmp_path / "script.srt", tmp_path / "plan.json"
+    helpers_ns.write_srt(srt_file, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan_file, DEFAULT_PLAN)
+
+    job = make_job(helpers_ns, srt_file, plan_file, tmp_path, source=synth_av)
+    report = helpers_ns.sde.run_job(job, ffmpeg_version)
+
+    assert report["ok"] is True
+    assert report["duration"]["expected_s"] == 8.5
+    assert abs(report["duration"]["drift_ms"]) <= 200
+    assert (tmp_path / "out.mp4").exists()
+    assert report["audio"]["mode"] == "silent"  # bg_volume=0, no voice
+
+
+# ---------------------------------------------------------------------------
+# 2. GB18030 SRT input — encoding fallback must let the pipeline complete
+# ---------------------------------------------------------------------------
+
+
+def test_gbk_srt_input(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    srt = tmp_path / "script_gbk.srt"
+    plan = tmp_path / "plan.json"
+    cjk_cues = [
+        (1, 0.0, 2.0, "第一条"),
+        (2, 2.0, 4.5, "第二条"),
+        (3, 6.0, 8.5, "第三条 含 gap"),
+    ]
+    helpers_ns.write_srt(srt, cjk_cues, encoding="gb18030")  # helper is asked for GB18030
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+    # style="auto" is passed explicitly; the last assertion checks the font it chose.
+    job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av, style="auto")
+    qc = helpers_ns.sde.run_job(job, ffmpeg_version)
+
+    assert qc["ok"] is True
+    assert "Microsoft YaHei UI" in qc["subtitles"]["force_style"], \
+        "auto style should pick cjk-natural when SRT contains CJK"
+
+
+# ---------------------------------------------------------------------------
+# 3. CJK in output path — work_dir + ensure_safe_subs_path must save us
+# ---------------------------------------------------------------------------
+
+
+def test_cjk_in_output_path(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    cjk_dir = tmp_path / "中文 目录"  # CJK characters plus a space in the name
+    cjk_dir.mkdir()
+    srt = cjk_dir / "字幕.srt"
+    plan = cjk_dir / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+    # SRT, plan and output all live under the CJK directory; the source does not.
+    out = cjk_dir / "成片.mp4"
+    job = make_job(helpers_ns, srt, plan, tmp_path,
+                   source=synth_av, output=out)
+    qc = helpers_ns.sde.run_job(job, ffmpeg_version)
+
+    assert qc["ok"] is True
+    assert out.exists()
+
+
+# ---------------------------------------------------------------------------
+# 4. Per-segment voice — audio.mode should reflect voice usage
+# ---------------------------------------------------------------------------
+
+
+def test_per_segment_voice(helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path):
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    # Form B plan: named sources and voices; only segment 1 references a voice.
+    helpers_ns.write_plan_form_b(
+        plan,
+        sources={"A": str(synth_av)},
+        voices={"v1": str(synth_voice)},
+        segments=[
+            {"id": 1, "source": "A", "source_start": "00:00:01,000",
+             "source_end": "00:00:03,000", "voice": "v1"},
+            {"id": 2, "source": "A", "source_start": "00:00:05,000",
+             "source_end": "00:00:07,500"},
+            {"id": 3, "source": "A", "source_start": "00:00:10,000",
+             "source_end": "00:00:12,500"},
+        ],
+    )
+
+    job = make_job(helpers_ns, srt, plan, tmp_path)  # source=None — Form B
+    qc = helpers_ns.sde.run_job(job, ffmpeg_version)
+
+    assert qc["ok"] is True
+    assert qc["audio"]["voice_used"] is True
+    assert qc["audio"]["mode"] == "voice_replace"  # bg_volume == 0
+
+
+# ---------------------------------------------------------------------------
+# 5. Video-only source + bg_volume > 0 → auto-degrade, no crash
+# ---------------------------------------------------------------------------
+
+
+def test_video_only_source_with_bg_volume(
+    helpers_ns, ffmpeg_version, synth_v_only, tmp_path, capsys
+):
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+    # bg_volume=0.5 asks for source audio; synth_v_only presumably has no audio track.
+    job = make_job(helpers_ns, srt, plan, tmp_path,
+                   source=synth_v_only, bg_volume=0.5)
+    qc = helpers_ns.sde.run_job(job, ffmpeg_version)
+
+    captured = capsys.readouterr()  # the warning is looked for on stdout, not stderr
+    assert "no audio track" in captured.out, \
+        "expected a WARNING about source having no audio"
+    assert qc["ok"] is True
+
+
+# ---------------------------------------------------------------------------
+# 6. Source range out of bounds → SystemExit before extraction
+# ---------------------------------------------------------------------------
+
+
+def test_range_out_of_bounds_fails_fast(
+    helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    # source is 30s, but ask for 0:50 — way over
+    helpers_ns.write_plan_form_a(plan, [
+        (1, 1.0, 3.0),
+        (2, 5.0, 7.5),
+        (3, 50.0, 52.5),  # bad: starts at 50s, past the end of the source
+    ])
+
+    job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av)
+    with pytest.raises(SystemExit) as exc:
+        helpers_ns.sde.run_job(job, ffmpeg_version)
+    assert "exceeds source" in str(exc.value)  # message must name the bounds problem
+    # And the failure happened pre-extract, so no out.mp4
+    assert not (tmp_path / "out.mp4").exists()
+
+
+# ---------------------------------------------------------------------------
+# 7. Second run hits cache for every segment
+# ---------------------------------------------------------------------------
+
+
+def test_cache_hit_on_rerun(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    """Running the same job twice: the second run reuses every segment."""
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+
+    job = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av)
+    qc1 = helpers_ns.sde.run_job(job, ffmpeg_version)
+    qc2 = helpers_ns.sde.run_job(job, ffmpeg_version)
+    # The per-segment `cached` flags are the deterministic evidence of reuse;
+    # wall-clock elapsed_s is not compared because it flakes on busy hosts.
+    assert all(s["cached"] is False for s in qc1["segments"])
+    assert all(s["cached"] is True for s in qc2["segments"])
+
+
+# ---------------------------------------------------------------------------
+# 8. --no-overwrite refuses to clobber existing output
+# ---------------------------------------------------------------------------
+
+
+def test_no_overwrite_refuses(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+    # First run targets make_job's default output, tmp_path / "out.mp4".
+    job1 = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av)
+    helpers_ns.sde.run_job(job1, ffmpeg_version)
+    # Second job targets the same output, this time with no_overwrite=True.
+    job2 = make_job(helpers_ns, srt, plan, tmp_path,
+                    source=synth_av, no_overwrite=True)
+    with pytest.raises(SystemExit) as exc:
+        helpers_ns.sde.run_job(job2, ffmpeg_version)
+    assert "no-overwrite" in str(exc.value)
+
+
+# ---------------------------------------------------------------------------
+# 9. Global --voice mixing, extract mode, and SRT gap → black+silent fill in the output
+# ---------------------------------------------------------------------------
+
+
+def test_global_voice_spans_timeline(
+    helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path
+):
+    """Global --voice must span the WHOLE output timeline, not restart per segment.
+
+    Regression: earlier implementation expanded --voice into a synthetic
+    per-segment voice on every entry, which made each segment apad/atrim
+    voice.wav from t=0 — so a 5s voice would replay at every cut. The fix
+    moves global-voice mixing into the final compose step where voice is
+    apad'd / atrim'd to total_duration once.
+    """
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)  # total 8.5s
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+
+    job = make_job(helpers_ns, srt, plan, tmp_path,
+                   source=synth_av, voice=synth_voice)
+    qc = helpers_ns.sde.run_job(job, ffmpeg_version)
+
+    assert qc["ok"] is True
+    assert qc["audio"]["voice_used"] is True
+    assert qc["audio"]["mode"] == "voice_replace"
+    assert qc["audio"]["bg_volume"] == 0.0  # make_job's default bg_volume
+    # Per-segment voice slot must be None — proves we are NOT smuggling the
+    # global voice in via the per-segment expansion hack.
+    assert all(s["voice"] is None for s in qc["segments"])
+
+    # Output duration matches SRT total (voice apad'd from 5s → 8.5s)
+    actual = helpers_ns.sde.probe_duration(tmp_path / "out.mp4")
+    assert abs(actual - 8.5) < 0.25, f"actual {actual}s vs expected 8.5s"
+
+
+def test_global_voice_with_bg_volume_mix(
+    helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path
+):
+    """With bg_volume>0 and global voice, base audio (source*bg) is mixed
+    under voice. The bg_volume is applied ONCE at extract; the final compose
+    must not re-scale it.
+    """
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+
+    job = make_job(helpers_ns, srt, plan, tmp_path,
+                   source=synth_av, voice=synth_voice, bg_volume=0.1)
+    qc = helpers_ns.sde.run_job(job, ffmpeg_version)
+    # NOTE(review): only the reported mode/volume are asserted, not audio levels.
+    assert qc["ok"] is True
+    assert qc["audio"]["mode"] == "voice_mix"
+    assert qc["audio"]["bg_volume"] == 0.1
+
+
+def test_global_voice_cache_independence(
+    helpers_ns, ffmpeg_version, synth_av, synth_voice, tmp_path
+):
+    """Segment cache must NOT depend on the global voice file. Running once
+    without voice then again with voice should reuse all segment caches —
+    voice gets mixed in the final pass, segments are identical.
+    """
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+
+    job_no_voice = make_job(helpers_ns, srt, plan, tmp_path, source=synth_av)
+    qc1 = helpers_ns.sde.run_job(job_no_voice, ffmpeg_version)
+
+    job_with_voice = make_job(
+        helpers_ns, srt, plan, tmp_path,
+        source=synth_av, voice=synth_voice,
+        output=tmp_path / "out_voiced.mp4",  # differs from the first run's out.mp4
+    )
+    qc2 = helpers_ns.sde.run_job(job_with_voice, ffmpeg_version)
+
+    assert all(s["cached"] is False for s in qc1["segments"]), \
+        "first run should not have cache hits"
+    assert all(s["cached"] is True for s in qc2["segments"]), \
+        "second run with global voice should hit segment cache — voice is " \
+        "mixed in the final pass, not baked into segments"
+
+
+def test_extract_mode_stops_after_clips(
+    helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    """--mode extract must produce per-cue clips and NOT a concat'd final.mp4."""
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+
+    job = make_job(helpers_ns, srt, plan, tmp_path,
+                   source=synth_av, mode="extract")
+    qc = helpers_ns.sde.run_job(job, ffmpeg_version)
+
+    # Extract-mode result shape differs from the QC report
+    assert qc["ok"] is True
+    assert qc["mode"] == "extract"
+    assert qc["clip_count"] == 3  # one clip per DEFAULT_CUES entry
+    extracted_dir = Path(qc["extracted_dir"])
+    assert extracted_dir.is_dir()
+
+    # Clips renamed to clip_<id:03d>.mp4 (matches srt_video_editor convention)
+    for cid in (1, 2, 3):
+        clip = extracted_dir / f"clip_{cid:03d}.mp4"
+        assert clip.is_file(), f"missing extracted clip: {clip}"
+        # Each clip should match its cue duration within encoder rounding
+        actual = helpers_ns.sde.probe_duration(clip)
+        expected = next(c for c in DEFAULT_CUES if c[0] == cid)  # cue tuple with this id
+        expected_dur = expected[2] - expected[1]  # third field minus second field
+        assert abs(actual - expected_dur) < 0.25, \
+            f"clip {cid}: actual {actual}s vs expected {expected_dur}s"
+
+    # And NO final.mp4 was produced — extract mode stopped early
+    assert not (tmp_path / "out.mp4").exists()
+
+
+def test_extract_mode_skips_gap_clips(
+    helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    """In extract mode, the synthetic black+silence gap clips are not made —
+    only real source extractions land in extracted_clips_/."""
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    # Cues with a 1.5s gap between id=2 and id=3 (final_end=4.5, final_start=6.0)
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+
+    job = make_job(helpers_ns, srt, plan, tmp_path,
+                   source=synth_av, mode="extract")
+    qc = helpers_ns.sde.run_job(job, ffmpeg_version)
+    extracted_dir = Path(qc["extracted_dir"])
+    # The directory listing must be exactly the three cue clips — no gap_*.mp4
+    files = sorted(p.name for p in extracted_dir.iterdir())
+    assert files == ["clip_001.mp4", "clip_002.mp4", "clip_003.mp4"]
+
+
+def test_extract_mode_cleans_stale_clips(
+    helpers_ns, ffmpeg_version, synth_av, tmp_path
+):
+    """A previous extract-mode run's stale clips must be removed before this
+    run writes its own. Otherwise leftover clip_998.mp4 would pollute the dir.
+    """
+    srt = tmp_path / "script.srt"
+    plan = tmp_path / "plan.json"
+    helpers_ns.write_srt(srt, DEFAULT_CUES)
+    helpers_ns.write_plan_form_a(plan, DEFAULT_PLAN)
+
+    job = make_job(helpers_ns, srt, plan, tmp_path,
+                   source=synth_av, mode="extract")
+    qc1 = helpers_ns.sde.run_job(job, ffmpeg_version)
+    extracted_dir = Path(qc1["extracted_dir"])
+
+    # Plant a stale clip + an unrelated note file
+    (extracted_dir / "clip_998.mp4").write_bytes(b"stale")
+    (extracted_dir / "notes.txt").write_text("user notes", encoding="utf-8")
+
+    qc2 = helpers_ns.sde.run_job(job, ffmpeg_version)  # same Job object, second run
+    files = sorted(p.name for p in Path(qc2["extracted_dir"]).iterdir())
+    assert "clip_998.mp4" not in files, "stale clip should have been removed"
+    assert "notes.txt" in files, "non-clip user files must be preserved"
+
+
+def test_gap_inserted_in_output(helpers_ns, ffmpeg_version, synth_av, tmp_path):
+    """A 1.5s hole between two cues is counted in the output duration."""
+    srt_file, plan_file = tmp_path / "script.srt", tmp_path / "plan.json"
+    # Timeline: 2.0s cue + 1.5s gap + 2.5s cue = 6.0s of output.
+    gapped_cues = [
+        (1, 0.0, 2.0, "first"),
+        (2, 3.5, 6.0, "second after gap"),
+    ]
+    helpers_ns.write_srt(srt_file, gapped_cues)
+    helpers_ns.write_plan_form_a(plan_file, [(1, 1.0, 3.0), (2, 5.0, 7.5)])
+
+    job = make_job(helpers_ns, srt_file, plan_file, tmp_path, source=synth_av)
+    report = helpers_ns.sde.run_job(job, ffmpeg_version)
+
+    assert report["ok"] is True
+    assert report["duration"]["expected_s"] == 6.0
+    assert abs(report["duration"]["drift_ms"]) <= 200
+    # Cross-check the reported figures against the rendered file itself.
+    actual = helpers_ns.sde.probe_duration(tmp_path / "out.mp4")
+    assert abs(actual - 6.0) < 0.25, f"actual {actual}s, expected 6.0s"
diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
new file mode 100644
index 0000000..8ce55cd
--- /dev/null
+++ b/tests/test_transcribe.py
@@ -0,0 +1,126 @@
+"""Unit tests for transcribe.py — only the pure conversion logic.
+
+API calls require a live DashScope key and external network; those are
+intentionally out of scope here. Run an end-to-end smoke test manually:
+
+    python helpers/transcribe.py path/to/clip.mp4 --language zh
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+
+ROOT = Path(__file__).resolve().parents[1]  # two levels up from this test file
+sys.path.insert(0, str(ROOT / "helpers"))  # make modules under helpers/ importable
+
+
+@pytest.fixture
+def transcribe_mod():
+    import transcribe  # found via the helpers/ dir put on sys.path at import time
+    return transcribe
+
+
+def test_convert_basic_sentence(transcribe_mod):
+    """A two-word sentence is flattened into a Scribe-style ``words`` list."""
+    first = {"begin_time": 0, "end_time": 500, "text": "你好", "punctuation": ""}
+    second = {"begin_time": 500, "end_time": 1500, "text": "世界", "punctuation": "。"}
+    sentence = {
+        "begin_time": 0,
+        "end_time": 1500,
+        "text": "你好世界",
+        "words": [first, second],
+    }
+
+    result = transcribe_mod._convert_dashscope_to_scribe([sentence], language_hint="zh")
+    assert result["language_code"] == "zh"
+    assert result["_source"].startswith("dashscope-")
+
+    words = result["words"]
+    assert len(words) == 2
+    # Inputs 0/500/1500 come back as 0.0/0.5/1.5; punctuation joins its word's text.
+    assert words[0] == {"text": "你好", "start": 0.0, "end": 0.5, "type": "word"}
+    assert words[1] == {"text": "世界。", "start": 0.5, "end": 1.5, "type": "word"}
+
+
+def test_convert_drops_empty_text(transcribe_mod):
+    """Entries whose text is empty or only whitespace are left out of the result."""
+    raw_words = [
+        {"begin_time": 0, "end_time": 100, "text": ""},
+        {"begin_time": 100, "end_time": 200, "text": "   "},
+        {"begin_time": 200, "end_time": 400, "text": "hello"},
+    ]
+    result = transcribe_mod._convert_dashscope_to_scribe(
+        [{"words": raw_words}], language_hint=None
+    )
+    assert len(result["words"]) == 1
+    assert result["words"][0]["text"] == "hello"
+    # With language_hint=None the envelope reports "auto".
+    assert result["language_code"] == "auto"
+
+
+def test_convert_multiple_sentences(transcribe_mod):
+    """Words of consecutive sentences end up in one list, in their original order."""
+    opening = {"words": [{"begin_time": 0, "end_time": 500, "text": "first"}]}
+    closing = {
+        "words": [
+            {"begin_time": 1000, "end_time": 1500, "text": "second"},
+            {"begin_time": 1500, "end_time": 2000, "text": "third"},
+        ]
+    }
+    result = transcribe_mod._convert_dashscope_to_scribe(
+        [opening, closing], language_hint="en"
+    )
+    assert [word["text"] for word in result["words"]] == ["first", "second", "third"]
+    assert result["words"][0]["start"] == 0.0
+    assert result["words"][-1]["end"] == 2.0
+
+
+def test_convert_tolerates_missing_or_bad_timestamps(transcribe_mod):
+    """A word whose timestamp is not numeric is dropped; the conversion
+    carries on with the remaining words instead of raising."""
+    raw_words = [
+        {"begin_time": "bad", "end_time": 500, "text": "junk"},
+        {"begin_time": 0, "end_time": 500, "text": "good"},
+    ]
+    result = transcribe_mod._convert_dashscope_to_scribe(
+        [{"words": raw_words}], language_hint=None
+    )
+    assert [word["text"] for word in result["words"]] == ["good"]
+
+
+def test_convert_empty_input(transcribe_mod):
+    """``[]`` and ``None`` both produce an envelope with an empty ``words`` list."""
+    convert = transcribe_mod._convert_dashscope_to_scribe
+    from_empty = convert([], language_hint=None)
+    assert from_empty["words"] == []
+    assert "language_code" in from_empty and "_source" in from_empty
+
+    assert convert(None, language_hint=None)["words"] == []
+
+
+def test_output_shape_compatible_with_recommender(transcribe_mod, tmp_path):
+    """Cross-module contract: the converter's JSON output can be fed straight
+    into ``recommend_edit_plan.load_transcript_words``."""
+    import json
+    import recommend_edit_plan
+
+    raw_words = [
+        {"begin_time": 1000, "end_time": 1500, "text": "hello", "punctuation": ""},
+        {"begin_time": 1500, "end_time": 2000, "text": "world", "punctuation": "."},
+    ]
+    transcript = transcribe_mod._convert_dashscope_to_scribe(
+        [{"words": raw_words}], language_hint="en"
+    )
+
+    # Round-trip through a file on disk, because the loader is handed a path.
+    transcript_path = tmp_path / "transcript.json"
+    transcript_path.write_text(json.dumps(transcript, ensure_ascii=False), encoding="utf-8")
+    loaded = recommend_edit_plan.load_transcript_words(transcript_path)
+    assert len(loaded) == 2
+    assert loaded[0]["text"] == "hello"
+    assert loaded[1]["text"] == "world."
+    assert loaded[0]["start"] == 1.0
+    assert loaded[1]["end"] == 2.0