browser-use · mckila · May 27, 2026 · cubic-dev-ai · May 27, 2026
diff --git a/SKILL.md b/SKILL.md
@@ -71,7 +71,7 @@ Helpers (`helpers/transcribe.py`, `helpers/render.py`, etc.) live alongside this
 
 ## Helpers
 
-- **`transcribe.py <video>`** — single-file Scribe call. `--num-speakers N` optional. Cached.
+- **`transcribe.py <video>`** — single-file Scribe call. `--num-speakers N` optional. `--prompt-file <path>` optional: vocabulary file (one phrase per line, `#` comments) to bias Scribe `keyterms` toward proper nouns / domain terms (brand names, people, places). Missing file is a warn+skip. Cached.
 - **`transcribe_batch.py <videos_dir>`** — 4-worker parallel transcription. Use for multi-take.
 - **`pack_transcripts.py --edit-dir <dir>`** — `transcripts/*.json` → `takes_packed.md` (phrase-level, break on silence ≥ 0.5s).
 - **`timeline_view.py <video> <start> <end>`** — filmstrip + waveform PNG. On-demand visual drill-down. **Not a scan tool** — use it at decision points, not constantly.

diff --git a/helpers/transcribe.py b/helpers/transcribe.py
@@ -11,6 +11,7 @@
     python helpers/transcribe.py <video_path> --edit-dir /custom/edit
     python helpers/transcribe.py <video_path> --language en
     python helpers/transcribe.py <video_path> --num-speakers 2
+    python helpers/transcribe.py <video_path> --prompt-file vocabulary.txt
 """
 
 from __future__ import annotations
@@ -29,6 +30,11 @@
 
 SCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text"
 
+# ElevenLabs Scribe `keyterms` constraints (see API docs).
+SCRIBE_KEYTERM_MAX_CHARS = 50
+SCRIBE_KEYTERM_MAX_WORDS = 5
+SCRIBE_KEYTERMS_MAX_COUNT = 1000
+
 
 def load_api_key() -> str:
     for candidate in [Path(__file__).resolve().parent.parent / ".env", Path(".env")]:
@@ -46,6 +52,60 @@ def load_api_key() -> str:
     return v
 
 
+def load_keyterms(prompt_file: Path | None, verbose: bool = True) -> list[str]:
+    """Load vocabulary-biasing keyterms from a prompt file.
+
+    Parses the file ignoring `#` comments and blank lines. Returns a list of
+    phrases suitable for ElevenLabs Scribe's `keyterms` parameter. The same
+    file format ("one phrase per line, # comments") is also a good fit for
+    OpenAI Whisper's `initial_prompt` (just join with commas).
+
+    Skips silently with a warning if the file is missing — a session whose
+    genre ships no vocabulary.txt should still transcribe successfully.
+
+    Filters terms that exceed Scribe's per-keyterm limits (50 chars, 5 words)
+    and truncates the total list to SCRIBE_KEYTERMS_MAX_COUNT (1000).
+    """
+    if prompt_file is None:
+        return []
+
+    if not prompt_file.exists():
+        if verbose:
+            print(
+                f"  warn: --prompt-file not found ({prompt_file}); transcribing without keyterm bias",
+                flush=True,
+            )
+        return []
+
+    raw_lines = prompt_file.read_text().splitlines()
+    phrases: list[str] = []
+    skipped_oversize = 0
+    for line in raw_lines:
+        stripped = line.strip()
+        if not stripped or stripped.startswith("#"):
+            continue
+        word_count = len(stripped.split())
+        if len(stripped) > SCRIBE_KEYTERM_MAX_CHARS or word_count > SCRIBE_KEYTERM_MAX_WORDS:
+            skipped_oversize += 1
+            continue
+        phrases.append(stripped)
+
+    if len(phrases) > SCRIBE_KEYTERMS_MAX_COUNT:
+        if verbose:
+            print(
+                f"  warn: --prompt-file has {len(phrases)} phrases; truncating to "
+                f"{SCRIBE_KEYTERMS_MAX_COUNT} (Scribe limit)",
+                flush=True,
+            )
+        phrases = phrases[:SCRIBE_KEYTERMS_MAX_COUNT]
+
+    if verbose:
+        suffix = f" (skipped {skipped_oversize} oversize)" if skipped_oversize else ""
+        print(f"  keyterms: {len(phrases)} loaded from {prompt_file.name}{suffix}", flush=True)
+
+    return phrases
+
+
 def extract_audio(video_path: Path, dest: Path) -> None:
     cmd = [
         "ffmpeg", "-y", "-i", str(video_path),
@@ -60,6 +120,7 @@ def call_scribe(
     api_key: str,
     language: str | None = None,
     num_speakers: int | None = None,
+    keyterms: list[str] | None = None,
 ) -> dict:
     data: dict[str, str] = {
         "model_id": "scribe_v1",
@@ -71,6 +132,10 @@ def call_scribe(
         data["language_code"] = language
     if num_speakers:
         data["num_speakers"] = str(num_speakers)
+    if keyterms:
+        # Scribe accepts `keyterms` as a JSON-encoded array of strings when
+        # sent over multipart/form-data.
+        data["keyterms"] = json.dumps(keyterms)
 
     with open(audio_path, "rb") as f:
         resp = requests.post(
@@ -93,6 +158,7 @@ def transcribe_one(
     api_key: str,
     language: str | None = None,
     num_speakers: int | None = None,
+    keyterms: list[str] | None = None,
     verbose: bool = True,
 ) -> Path:
     """Transcribe a single video. Returns path to transcript JSON.
@@ -118,7 +184,7 @@ def transcribe_one(
         size_mb = audio.stat().st_size / (1024 * 1024)
         if verbose:
             print(f"  uploading {video.stem}.wav ({size_mb:.1f} MB)", flush=True)
-        payload = call_scribe(audio, api_key, language, num_speakers)
+        payload = call_scribe(audio, api_key, language, num_speakers, keyterms)
 
     out_path.write_text(json.dumps(payload, indent=2))
     dt = time.time() - t0
@@ -153,6 +219,19 @@ def main() -> None:
         default=None,
         help="Optional number of speakers when known. Improves diarization accuracy.",
     )
+    ap.add_argument(
+        "--prompt-file",
+        type=Path,
+        default=None,
+        help=(
+            "Optional vocabulary file (one phrase per line, '#' for comments). "
+            "Phrases are passed as ElevenLabs Scribe `keyterms` to bias "
+            "transcription toward proper nouns / domain terms (e.g. brand names, "
+            "people, places). Missing file is a warn+skip, not an error. "
+            "Equivalent semantics to Whisper's --initial-prompt; the same file "
+            "format works for both engines."
+        ),
+    )
     args = ap.parse_args()
 
     video = args.video.resolve()
@@ -161,13 +240,15 @@ def main() -> None:
 
     edit_dir = (args.edit_dir or (video.parent / "edit")).resolve()
     api_key = load_api_key()
+    keyterms = load_keyterms(args.prompt_file)
 
     transcribe_one(
         video=video,
         edit_dir=edit_dir,
         api_key=api_key,
         language=args.language,
         num_speakers=args.num_speakers,
+        keyterms=keyterms,
     )
 
 

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,14 @@
+"""Shared test fixtures.
+
+Adds the repo root to sys.path so `helpers.transcribe` is importable without
+having to install the package.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
diff --git a/tests/test_transcribe_prompt_file.py b/tests/test_transcribe_prompt_file.py
@@ -0,0 +1,205 @@
+"""Tests for `--prompt-file` keyterm biasing in helpers/transcribe.py.
+
+The flag exposes ElevenLabs Scribe's `keyterms` array — a vocabulary prior
+that improves recognition of proper nouns and domain terms (brand names,
+people, places). The same prompt-file format works for OpenAI Whisper's
+`initial_prompt` (just join the lines with commas), so the CLI flag is
+engine-agnostic.
+
+Background — the IUIC `show MMC` incident (2026-05-26): on clean speaker
+audio, the canonical IUIC salutation `"shalom Most High in Christ Bless"`
+transcribed as `"show MMC in Christ Bless"`. A pure post-process fuzzy
+correction can't recover `"Most High" -> "MMC"` (no character overlap), so
+the fix has to happen at the engine's input. This flag is that input.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from helpers import transcribe
+
+
+# ----------------------------- parser tests -----------------------------
+
+
+def test_load_keyterms_returns_empty_when_no_file():
+    assert transcribe.load_keyterms(None, verbose=False) == []
+
+
+def test_load_keyterms_strips_comments_and_blanks(tmp_path: Path):
+    f = tmp_path / "vocab.txt"
+    f.write_text(
+        "# IUIC vocabulary\n"
+        "Most High\n"
+        "\n"
+        "  # indented comment\n"
+        "shalom\n"
+        "\n"
+        "Israel United in Christ\n"
+    )
+    assert transcribe.load_keyterms(f, verbose=False) == [
+        "Most High",
+        "shalom",
+        "Israel United in Christ",
+    ]
+
+
+def test_load_keyterms_three_term_fixture_round_trip(tmp_path: Path):
+    """test_prompt_file_loads_and_passes_to_whisper — fixture with 3 terms,
+    mock the engine call, assert all 3 terms travel through to the keyterms
+    parameter."""
+    fixture = tmp_path / "vocabulary.txt"
+    fixture.write_text(
+        "# fixture\n"
+        "Yahawah\n"
+        "Yahawashi\n"
+        "Israelites United in Christ\n"
+    )
+    audio = tmp_path / "clip.wav"
+    audio.write_bytes(b"RIFF0000WAVEfmt ")
+
+    captured: dict = {}
+
+    fake_resp = MagicMock()
+    fake_resp.status_code = 200
+    fake_resp.json.return_value = {"text": "ok", "words": []}
+
+    def fake_post(url, headers, files, data, timeout):  # noqa: ARG001
+        captured.update(data)
+        return fake_resp
+
+    keyterms = transcribe.load_keyterms(fixture, verbose=False)
+    with patch.object(transcribe.requests, "post", side_effect=fake_post):
+        transcribe.call_scribe(
+            audio_path=audio,
+            api_key="test-key",
+            keyterms=keyterms,
+        )
+
+    assert "keyterms" in captured, "keyterms must be sent to Scribe"
+    sent = json.loads(captured["keyterms"])
+    assert sent == ["Yahawah", "Yahawashi", "Israelites United in Christ"]
+
+
+def test_load_keyterms_missing_file_is_clean_skip(tmp_path: Path, capsys):
+    """test_prompt_file_missing_is_clean_skip — nonexistent file must warn
+    and return [], so a session with no vocabulary.txt still transcribes."""
+    missing = tmp_path / "does-not-exist.txt"
+    result = transcribe.load_keyterms(missing, verbose=True)
+    assert result == []
+    captured = capsys.readouterr()
+    assert "warn" in captured.out.lower()
+    assert str(missing) in captured.out
+
+
+def test_load_keyterms_filters_oversize_phrases(tmp_path: Path):
+    """Scribe rejects keyterms >50 chars or >5 words; we filter, don't fail."""
+    f = tmp_path / "v.txt"
+    f.write_text(
+        "Most High\n"
+        f"{'x' * 60}\n"  # too long
+        "one two three four five six\n"  # too many words
+        "shalom family\n"
+    )
+    assert transcribe.load_keyterms(f, verbose=False) == ["Most High", "shalom family"]
+
+
+def test_load_keyterms_truncates_to_scribe_limit(tmp_path: Path):
+    f = tmp_path / "v.txt"
+    f.write_text("\n".join(f"term{i}" for i in range(1500)))
+    out = transcribe.load_keyterms(f, verbose=False)
+    assert len(out) == transcribe.SCRIBE_KEYTERMS_MAX_COUNT
+
+
+def test_call_scribe_omits_keyterms_when_empty(tmp_path: Path):
+    """Empty keyterms => no `keyterms` field, no 20% surcharge."""
+    audio = tmp_path / "clip.wav"
+    audio.write_bytes(b"RIFF0000WAVEfmt ")
+    captured: dict = {}
+
+    fake_resp = MagicMock()
+    fake_resp.status_code = 200
+    fake_resp.json.return_value = {"text": "ok"}
+
+    def fake_post(url, headers, files, data, timeout):  # noqa: ARG001
+        captured.update(data)
+        return fake_resp
+
+    with patch.object(transcribe.requests, "post", side_effect=fake_post):
+        transcribe.call_scribe(audio_path=audio, api_key="k", keyterms=[])
+
+    assert "keyterms" not in captured
+
+
+# ----------------------------- CLI surface -----------------------------
+
+
+def test_help_text_documents_prompt_file_flag():
+    """--help must surface the new flag so operators discover it."""
+    repo_root = Path(__file__).resolve().parent.parent
+    proc = subprocess.run(
+        [sys.executable, str(repo_root / "helpers" / "transcribe.py"), "--help"],
+        capture_output=True,
+        text=True,
+        timeout=15,
+    )
+    assert proc.returncode == 0, proc.stderr
+    assert "--prompt-file" in proc.stdout
+    assert "vocabulary" in proc.stdout.lower()
+
+
+# --------------------- real-execution slice (gated) ---------------------
+
+
+@pytest.mark.skipif(
+    os.environ.get("IUIC_RUN_REAL_SCRIBE") != "1",
+    reason="set IUIC_RUN_REAL_SCRIBE=1 to exercise the live ElevenLabs Scribe API",
+)
+def test_keyterms_recover_iuic_phrase_on_real_scribe(tmp_path: Path):
+    """Real-execution slice. Synthesizes a TTS clip saying the canonical IUIC
+    salutation and verifies that with --prompt-file biasing, Scribe lands on
+    a vocabulary-canonical transcript more reliably than without it.
+
+    Gated on IUIC_RUN_REAL_SCRIBE=1 + ELEVENLABS_API_KEY in env.
+    Requires `say` (macOS) or skips if no TTS binary is available.
+    """
+    import shutil
+
+    if not shutil.which("say"):
+        pytest.skip("macOS `say` not available; cannot synthesize test audio")
+    if not shutil.which("ffmpeg"):
+        pytest.skip("ffmpeg required to convert TTS output")
+    if not os.environ.get("ELEVENLABS_API_KEY"):
+        pytest.skip("ELEVENLABS_API_KEY required for real Scribe call")
+
+    phrase = "shalom Most High in Christ bless"
+    aiff = tmp_path / "phrase.aiff"
+    wav = tmp_path / "phrase.wav"
+    subprocess.run(["say", "-o", str(aiff), phrase], check=True)
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", str(aiff), "-ac", "1", "-ar", "16000", str(wav)],
+        check=True,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+
+    vocab = tmp_path / "vocab.txt"
+    vocab.write_text("Most High\nshalom\nin Christ bless\n")
+    keyterms = transcribe.load_keyterms(vocab, verbose=False)
+    api_key = transcribe.load_api_key()
+
+    biased = transcribe.call_scribe(wav, api_key, keyterms=keyterms)
+    biased_text = biased.get("text", "").lower()
+
+    # The canonical phrase has to show up; we don't compare against an
+    # un-biased call (Scribe is non-deterministic enough that a head-to-head
+    # in a single CI run is noisy). The presence assertion is the regression.
+    assert "most high" in biased_text or "shalom" in biased_text, biased