Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ Helpers (`helpers/transcribe.py`, `helpers/render.py`, etc.) live alongside this

## Helpers

- **`transcribe.py <video>`** — single-file Scribe call. `--num-speakers N` optional. Cached.
- **`transcribe.py <video>`** — single-file Scribe call. `--num-speakers N` optional. `--prompt-file <path>` optional: vocabulary file (one phrase per line, `#` comments) to bias Scribe `keyterms` toward proper nouns / domain terms (brand names, people, places). Missing file is a warn+skip. Cached.
- **`transcribe_batch.py <videos_dir>`** — 4-worker parallel transcription. Use for multi-take.
- **`pack_transcripts.py --edit-dir <dir>`** — `transcripts/*.json` → `takes_packed.md` (phrase-level, break on silence ≥ 0.5s).
- **`timeline_view.py <video> <start> <end>`** — filmstrip + waveform PNG. On-demand visual drill-down. **Not a scan tool** — use it at decision points, not constantly.
Expand Down
83 changes: 82 additions & 1 deletion helpers/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
python helpers/transcribe.py <video_path> --edit-dir /custom/edit
python helpers/transcribe.py <video_path> --language en
python helpers/transcribe.py <video_path> --num-speakers 2
python helpers/transcribe.py <video_path> --prompt-file vocabulary.txt
"""

from __future__ import annotations
Expand All @@ -29,6 +30,11 @@

SCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text"

# ElevenLabs Scribe `keyterms` constraints (see API docs).
# Longest single keyterm, in characters, that load_keyterms() will keep.
SCRIBE_KEYTERM_MAX_CHARS = 50
# Most whitespace-separated words a single keyterm may contain.
SCRIBE_KEYTERM_MAX_WORDS = 5
# Cap on the number of keyterms kept from one prompt file; extras are dropped.
SCRIBE_KEYTERMS_MAX_COUNT = 1000


def load_api_key() -> str:
for candidate in [Path(__file__).resolve().parent.parent / ".env", Path(".env")]:
Expand All @@ -46,6 +52,60 @@ def load_api_key() -> str:
return v


def load_keyterms(prompt_file: Path | None, verbose: bool = True) -> list[str]:
    """Load vocabulary-biasing keyterms from a prompt file.

    The file holds one phrase per line. Blank lines and lines whose first
    non-whitespace character is `#` are ignored; surrounding whitespace is
    stripped from every phrase. The result is suitable for ElevenLabs
    Scribe's `keyterms` parameter. The same file format ("one phrase per
    line, # comments") is also a good fit for OpenAI Whisper's
    `initial_prompt` (just join with commas).

    The file is always decoded as UTF-8 so that non-ASCII names load the
    same way on every platform, and a leading byte-order mark (as written by
    some Windows editors) is dropped so it cannot turn a first-line comment
    into a bogus keyterm.

    A missing file is not an error: a warning is printed (when `verbose`)
    and an empty list is returned, so a session whose genre ships no
    vocabulary.txt still transcribes successfully.

    Phrases longer than SCRIBE_KEYTERM_MAX_CHARS characters or with more
    than SCRIBE_KEYTERM_MAX_WORDS words are skipped, and the surviving list
    is truncated to its first SCRIBE_KEYTERMS_MAX_COUNT entries.

    Args:
        prompt_file: Path to the vocabulary file, or None for "no biasing".
        verbose: When True, print warnings and a one-line load summary.

    Returns:
        The accepted phrases in file order (possibly empty).

    Raises:
        OSError: If the path exists but cannot be read (e.g. a directory or
            a permission problem).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if prompt_file is None:
        return []

    # EAFP: reading directly avoids the exists()/read race. NotADirectoryError
    # covers "a/b" where "a" is a regular file, which also means "not there".
    try:
        text = prompt_file.read_text(encoding="utf-8-sig")
    except (FileNotFoundError, NotADirectoryError):
        if verbose:
            print(
                f"  warn: --prompt-file not found ({prompt_file}); transcribing without keyterm bias",
                flush=True,
            )
        return []

    phrases: list[str] = []
    skipped_oversize = 0
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        word_count = len(stripped.split())
        if len(stripped) > SCRIBE_KEYTERM_MAX_CHARS or word_count > SCRIBE_KEYTERM_MAX_WORDS:
            # Filter rather than fail: one bad line must not block the run.
            skipped_oversize += 1
            continue
        phrases.append(stripped)

    if len(phrases) > SCRIBE_KEYTERMS_MAX_COUNT:
        if verbose:
            print(
                f"  warn: --prompt-file has {len(phrases)} phrases; truncating to "
                f"{SCRIBE_KEYTERMS_MAX_COUNT} (Scribe limit)",
                flush=True,
            )
        phrases = phrases[:SCRIBE_KEYTERMS_MAX_COUNT]

    if verbose:
        suffix = f" (skipped {skipped_oversize} oversize)" if skipped_oversize else ""
        print(f"  keyterms: {len(phrases)} loaded from {prompt_file.name}{suffix}", flush=True)

    return phrases


def extract_audio(video_path: Path, dest: Path) -> None:
cmd = [
"ffmpeg", "-y", "-i", str(video_path),
Expand All @@ -60,6 +120,7 @@ def call_scribe(
api_key: str,
language: str | None = None,
num_speakers: int | None = None,
keyterms: list[str] | None = None,
) -> dict:
data: dict[str, str] = {
"model_id": "scribe_v1",
Expand All @@ -71,6 +132,10 @@ def call_scribe(
data["language_code"] = language
if num_speakers:
data["num_speakers"] = str(num_speakers)
if keyterms:
# Scribe accepts `keyterms` as a JSON-encoded array of strings when
# sent over multipart/form-data.
data["keyterms"] = json.dumps(keyterms)

with open(audio_path, "rb") as f:
resp = requests.post(
Expand All @@ -93,6 +158,7 @@ def transcribe_one(
api_key: str,
language: str | None = None,
num_speakers: int | None = None,
keyterms: list[str] | None = None,
verbose: bool = True,
) -> Path:
"""Transcribe a single video. Returns path to transcript JSON.
Expand All @@ -118,7 +184,7 @@ def transcribe_one(
size_mb = audio.stat().st_size / (1024 * 1024)
if verbose:
print(f" uploading {video.stem}.wav ({size_mb:.1f} MB)", flush=True)
payload = call_scribe(audio, api_key, language, num_speakers)
payload = call_scribe(audio, api_key, language, num_speakers, keyterms)

out_path.write_text(json.dumps(payload, indent=2))
dt = time.time() - t0
Expand Down Expand Up @@ -153,6 +219,19 @@ def main() -> None:
default=None,
help="Optional number of speakers when known. Improves diarization accuracy.",
)
ap.add_argument(
"--prompt-file",
type=Path,
default=None,
help=(
"Optional vocabulary file (one phrase per line, '#' for comments). "
"Phrases are passed as ElevenLabs Scribe `keyterms` to bias "
"transcription toward proper nouns / domain terms (e.g. brand names, "
"people, places). Missing file is a warn+skip, not an error. "
"Equivalent semantics to Whisper's --initial-prompt; the same file "
"format works for both engines."
),
)
args = ap.parse_args()

video = args.video.resolve()
Expand All @@ -161,13 +240,15 @@ def main() -> None:

edit_dir = (args.edit_dir or (video.parent / "edit")).resolve()
api_key = load_api_key()
keyterms = load_keyterms(args.prompt_file)

transcribe_one(
video=video,
edit_dir=edit_dir,
api_key=api_key,
language=args.language,
num_speakers=args.num_speakers,
keyterms=keyterms,
)


Expand Down
Empty file added tests/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Shared test fixtures.

Adds the repo root to sys.path so `helpers.transcribe` is importable without
having to install the package.
"""

from __future__ import annotations

import sys
from pathlib import Path

# This file lives in <repo>/tests/, so the repository root is one level above
# the directory that contains it.
_tests_dir = Path(__file__).resolve().parent
REPO_ROOT = _tests_dir.parent

# Put the repo root first on the import path (once) so `helpers.*` resolves
# without installing the package.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
205 changes: 205 additions & 0 deletions tests/test_transcribe_prompt_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
"""Tests for `--prompt-file` keyterm biasing in helpers/transcribe.py.

The flag exposes ElevenLabs Scribe's `keyterms` array — a vocabulary prior
that improves recognition of proper nouns and domain terms (brand names,
people, places). The same prompt-file format works for OpenAI Whisper's
`initial_prompt` (just join the lines with commas), so the CLI flag is
engine-agnostic.

Background — the IUIC `show MMC` incident (2026-05-26): on clean speaker
audio, the canonical IUIC salutation `"shalom Most High in Christ Bless"`
transcribed as `"show MMC in Christ Bless"`. A pure post-process fuzzy
correction can't recover `"Most High" -> "MMC"` (no character overlap), so
the fix has to happen at the engine's input. This flag is that input.
"""

from __future__ import annotations

import json
import os
import subprocess
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from helpers import transcribe


# ----------------------------- parser tests -----------------------------


def test_load_keyterms_returns_empty_when_no_file():
    """Passing `None` as the prompt file yields an empty keyterm list."""
    loaded = transcribe.load_keyterms(None, verbose=False)
    assert loaded == []


def test_load_keyterms_strips_comments_and_blanks(tmp_path: Path):
    """Comment lines (even indented ones) and blank lines are dropped."""
    vocab_lines = [
        "# IUIC vocabulary",
        "Most High",
        "",
        " # indented comment",
        "shalom",
        "",
        "Israel United in Christ",
    ]
    vocab = tmp_path / "vocab.txt"
    vocab.write_text("".join(f"{entry}\n" for entry in vocab_lines))

    expected = ["Most High", "shalom", "Israel United in Christ"]
    assert transcribe.load_keyterms(vocab, verbose=False) == expected


def test_load_keyterms_three_term_fixture_round_trip(tmp_path: Path):
    """Three fixture terms survive the trip from file to the Scribe request.

    Loads a three-term vocabulary file, replaces `requests.post` with a
    recorder, and checks that the form data handed to the engine carries
    all three terms, JSON-encoded, under `keyterms`.
    """
    expected_terms = ["Yahawah", "Yahawashi", "Israelites United in Christ"]

    vocab_path = tmp_path / "vocabulary.txt"
    vocab_path.write_text(
        "# fixture\n"
        "Yahawah\n"
        "Yahawashi\n"
        "Israelites United in Christ\n"
    )
    # Minimal placeholder bytes; the HTTP call is mocked so it is never parsed.
    wav_path = tmp_path / "clip.wav"
    wav_path.write_bytes(b"RIFF0000WAVEfmt ")

    response = MagicMock()
    response.status_code = 200
    response.json.return_value = {"text": "ok", "words": []}

    sent_form: dict = {}

    def record_post(url, headers, files, data, timeout):  # noqa: ARG001
        # Keep a copy of the form fields so they can be inspected afterwards.
        sent_form.update(data)
        return response

    loaded = transcribe.load_keyterms(vocab_path, verbose=False)
    with patch.object(transcribe.requests, "post", side_effect=record_post):
        transcribe.call_scribe(
            audio_path=wav_path,
            api_key="test-key",
            keyterms=loaded,
        )

    assert "keyterms" in sent_form, "keyterms must be sent to Scribe"
    assert json.loads(sent_form["keyterms"]) == expected_terms


def test_load_keyterms_missing_file_is_clean_skip(tmp_path: Path, capsys):
    """A nonexistent prompt file warns on stdout and returns [].

    A session that ships no vocabulary.txt must still be able to transcribe.
    """
    absent = tmp_path / "does-not-exist.txt"

    assert transcribe.load_keyterms(absent, verbose=True) == []

    stdout = capsys.readouterr().out
    assert "warn" in stdout.lower()
    assert str(absent) in stdout


def test_load_keyterms_filters_oversize_phrases(tmp_path: Path):
    """Phrases over 50 chars or over 5 words are filtered out, not fatal."""
    too_long = "x" * 60
    too_many_words = "one two three four five six"
    entries = ["Most High", too_long, too_many_words, "shalom family"]

    vocab = tmp_path / "v.txt"
    vocab.write_text("".join(f"{entry}\n" for entry in entries))

    kept = transcribe.load_keyterms(vocab, verbose=False)
    assert kept == ["Most High", "shalom family"]


def test_load_keyterms_truncates_to_scribe_limit(tmp_path: Path):
    """An over-long vocabulary is cut to the first MAX_COUNT terms, in order.

    Checking only the length would let a truncation that kept the wrong
    entries (or reordered them) slip through, so the exact prefix is pinned.
    The fixture size is derived from the limit so the test stays meaningful
    if the constant changes.
    """
    limit = transcribe.SCRIBE_KEYTERMS_MAX_COUNT
    terms = [f"term{i}" for i in range(limit + 500)]

    f = tmp_path / "v.txt"
    f.write_text("\n".join(terms))

    out = transcribe.load_keyterms(f, verbose=False)
    assert len(out) == limit
    assert out == terms[:limit]


def test_call_scribe_omits_keyterms_when_empty(tmp_path: Path):
    """An empty keyterm list must not add a `keyterms` form field.

    Per the original note on this test, leaving the field out also avoids
    the 20% keyterms surcharge.
    """
    wav_path = tmp_path / "clip.wav"
    wav_path.write_bytes(b"RIFF0000WAVEfmt ")

    response = MagicMock()
    response.status_code = 200
    response.json.return_value = {"text": "ok"}

    sent_form: dict = {}

    def record_post(url, headers, files, data, timeout):  # noqa: ARG001
        # Capture the outgoing form fields for the assertion below.
        sent_form.update(data)
        return response

    with patch.object(transcribe.requests, "post", side_effect=record_post):
        transcribe.call_scribe(audio_path=wav_path, api_key="k", keyterms=[])

    assert "keyterms" not in sent_form


# ----------------------------- CLI surface -----------------------------


def test_help_text_documents_prompt_file_flag():
    """`--help` output mentions the new flag so operators can discover it."""
    script = Path(__file__).resolve().parent.parent / "helpers" / "transcribe.py"
    command = [sys.executable, str(script), "--help"]

    result = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=15,
    )

    assert result.returncode == 0, result.stderr
    help_text = result.stdout
    assert "--prompt-file" in help_text
    assert "vocabulary" in help_text.lower()


# --------------------- real-execution slice (gated) ---------------------


@pytest.mark.skipif(
    os.environ.get("IUIC_RUN_REAL_SCRIBE") != "1",
    reason="set IUIC_RUN_REAL_SCRIBE=1 to exercise the live ElevenLabs Scribe API",
)
def test_keyterms_recover_iuic_phrase_on_real_scribe(tmp_path: Path):
    """Real-execution slice against the live Scribe API.

    Synthesizes a TTS clip of the canonical IUIC salutation, transcribes it
    with keyterm biasing, and requires the biased keyterm "Most High" to
    appear in the transcript. That phrase is the one that regressed to
    "MMC" in the original incident, so it is the one that must be pinned.

    Gated on IUIC_RUN_REAL_SCRIBE=1 + ELEVENLABS_API_KEY in env.
    Skips when `say` (macOS TTS) or `ffmpeg` is not available.
    """
    import shutil

    if not shutil.which("say"):
        pytest.skip("macOS `say` not available; cannot synthesize test audio")
    if not shutil.which("ffmpeg"):
        pytest.skip("ffmpeg required to convert TTS output")
    if not os.environ.get("ELEVENLABS_API_KEY"):
        pytest.skip("ELEVENLABS_API_KEY required for real Scribe call")

    phrase = "shalom Most High in Christ bless"
    aiff = tmp_path / "phrase.aiff"
    wav = tmp_path / "phrase.wav"
    subprocess.run(["say", "-o", str(aiff), phrase], check=True)
    # Convert the TTS output to a mono, 16 kHz WAV for upload.
    subprocess.run(
        ["ffmpeg", "-y", "-i", str(aiff), "-ac", "1", "-ar", "16000", str(wav)],
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    vocab = tmp_path / "vocab.txt"
    vocab.write_text("Most High\nshalom\nin Christ bless\n")
    keyterms = transcribe.load_keyterms(vocab, verbose=False)
    api_key = transcribe.load_api_key()

    biased = transcribe.call_scribe(wav, api_key, keyterms=keyterms)
    biased_text = biased.get("text", "").lower()

    # We don't compare against an un-biased call (Scribe is non-deterministic
    # enough that a head-to-head in a single CI run is noisy). The regression
    # is "Most High" -> "MMC", so "most high" itself must be present; accepting
    # "shalom" alone would let the exact failure being guarded against pass.
    assert "most high" in biased_text, biased

@cubic-dev-ai cubic-dev-ai Bot May 27, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: The live Scribe regression test is too permissive: it can pass without validating recovery of the biased keyterm.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At tests/test_transcribe_prompt_file.py, line 205:

<comment>The live Scribe regression test is too permissive: it can pass without validating recovery of the biased keyterm.</comment>

<file context>
@@ -0,0 +1,205 @@
+    # The canonical phrase has to show up; we don't compare against an
+    # un-biased call (Scribe is non-deterministic enough that a head-to-head
+    # in a single CI run is noisy). The presence assertion is the regression.
+    assert "most high" in biased_text or "shalom" in biased_text, biased
</file context>
Fix with Cubic