diff --git a/.env.example b/.env.example index 52bb86d..8414808 100644 --- a/.env.example +++ b/.env.example @@ -8,9 +8,20 @@ TRANSCRIBER_MODEL=whisper-large-v3-turbo # groq:openai/gpt-oss-120b (cloud, default) # groq:llama-3.3-70b-versatile (cloud, stable) # openai:gpt-4o-mini (cloud, reliable) +# openai:bilingualsub-gemini-flash (Docker Compose via CLIProxyAPI) # ollama:TwinkleAI/gemma-3-4B-T1-it (local, free) TRANSLATOR_MODEL=groq:openai/gpt-oss-120b +# === Optional CLIProxyAPI (used only by docker-compose.yml) === +# Needed only if you want translations to route through CLIProxyAPI/agy. +# 1. Run host OAuth login first: cliproxyapi -antigravity-login +# 2. docker-compose.yml mounts this auth directory into the cli-proxy container. +# Leave CLIPROXY_AUTH_DIR unset to use ${HOME}/.cli-proxy-api. +# Set an absolute path only if your CLIProxyAPI auth directory is elsewhere. +# CLIPROXY_AUTH_DIR=/Users/you/.cli-proxy-api +CLIPROXY_PORT=8317 +BILINGUALSUB_PORT=7860 + # === API Keys (only needed for cloud providers) === GROQ_API_KEY= OPENAI_API_KEY= diff --git a/.gitignore b/.gitignore index 781f01e..260b9e6 100644 --- a/.gitignore +++ b/.gitignore @@ -91,6 +91,10 @@ frontend/.vite/ # Logs *.log logs/ +cliproxy-logs/ + +# CLIProxyAPI OAuth tokens +.cli-proxy-api/ # Temporary tmp/ diff --git a/.prettierignore b/.prettierignore index 5719d80..2944443 100644 --- a/.prettierignore +++ b/.prettierignore @@ -15,6 +15,10 @@ __pycache__/ .venv/ venv/ htmlcov/ +.mypy_cache/ +.pytest_cache/ +.ruff_cache/ +coverage.xml # IDE .idea/ @@ -24,3 +28,5 @@ htmlcov/ *.min.js *.min.css coverage/ +.playwright-mcp/ +preview-*.html diff --git a/.secrets.baseline b/.secrets.baseline index 5d1a66b..2626389 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -139,7 +139,7 @@ "filename": "tests/unit/core/test_transcriber.py", "hashed_secret": "2e7a7ee14caebf378fc32d6cf6f557f347c96773", "is_verified": false, - "line_number": 78 + "line_number": 82 } ], "tests/unit/utils/test_config.py": [ @@ -166,5 +166,5 @@ } ] }, - "generated_at": "2026-02-11T14:21:40Z" + "generated_at": "2026-06-26T09:21:31Z" } diff --git a/README.md b/README.md index c2f7f9c..7c323de 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,56 @@ docker build -t bilingualsub . && docker run -p 7860:7860 -e GROQ_API_KEY=your_k Then open http://localhost:7860 in your browser. +### Optional: Docker Compose with CLIProxyAPI + +This path is optional. Use it only when you want translations to go through a +local CLIProxyAPI container backed by your own Antigravity/Codex/Claude OAuth +login. The regular Docker flow above does not require CLIProxyAPI. + +For Antigravity/agy, CLIProxyAPI can route requests out of the box once the +host has OAuth credentials. Install CLIProxyAPI on the host and log in. This +creates OAuth token files under `~/.cli-proxy-api`, which are mounted read/write +into the proxy container: + +```bash +cliproxyapi -antigravity-login +``` + +Create a local `.env` from the example and set at least `GROQ_API_KEY`: + +```bash +cp .env.example .env +``` + +For the compose setup, use an OpenAI-compatible proxy model: + +```env +TRANSLATOR_MODEL=openai:bilingualsub-gemini-flash +# Optional: set only when your auth directory is not ~/.cli-proxy-api +# CLIPROXY_AUTH_DIR=/absolute/path/to/.cli-proxy-api +``` + +Then start both services: + +```bash +docker compose up --build +``` + +BilingualSub runs at http://localhost:7860. It talks to CLIProxyAPI through the +compose network at `http://cli-proxy:8317/v1`, so OAuth tokens are never baked +into the image or committed to the repository. +The proxy port is bound to `127.0.0.1` only; the compose stack uses the fixed +local bearer key `bilingualsub-local` internally. + +The default alias maps to Antigravity's `gemini-3.5-flash-low`, which is the +most consistently discoverable Flash variant in current CLIProxyAPI releases. +If the alias does not exist in your version, list the available proxy models +and set `TRANSLATOR_MODEL=openai:` in `.env`: + +```bash +curl -H "Authorization: Bearer bilingualsub-local" http://localhost:8317/v1/models +``` + ### Local Development **Prerequisites**: Python 3.11+, FFmpeg, Node.js 18+, pnpm @@ -56,6 +106,7 @@ Backend runs at http://localhost:8000, frontend at http://localhost:5173. | `TRANSCRIBER_PROVIDER` | Transcription provider | `groq` | No | | `TRANSCRIBER_MODEL` | Whisper model to use | `whisper-large-v3-turbo` | No | | `TRANSLATOR_MODEL` | LLM model for translation | `groq:openai/gpt-oss-120b` | No | +| `OPENAI_BASE_URL` | OpenAI-compatible proxy URL | - | No | ## Architecture diff --git a/README.zh-TW.md b/README.zh-TW.md index 73489f5..256e340 100644 --- a/README.zh-TW.md +++ b/README.zh-TW.md @@ -27,6 +27,53 @@ docker build -t bilingualsub . && docker run -p 7860:7860 -e GROQ_API_KEY=your_k 然後在瀏覽器開啟 http://localhost:7860。 +### 選用:使用 CLIProxyAPI 的 Docker Compose + +這個流程是選用的。只有當你想讓翻譯走本機 CLIProxyAPI container,並使用 +自己的 Antigravity/Codex/Claude OAuth 登入狀態時才需要。上面的單純 Docker +流程不需要 CLIProxyAPI。 + +對 Antigravity/agy 來說,只要 host 已有 OAuth credentials,CLIProxyAPI 就能 +開箱路由請求。先在 host 安裝 CLIProxyAPI 並登入。OAuth token 會建立在 +`~/.cli-proxy-api`,之後由 compose 掛進 proxy container: + +```bash +cliproxyapi -antigravity-login +``` + +從範例建立本機 `.env`,並至少設定 `GROQ_API_KEY`: + +```bash +cp .env.example .env +``` + +Compose 模式請使用 OpenAI-compatible proxy model: + +```env +TRANSLATOR_MODEL=openai:bilingualsub-gemini-flash +# 選填:只有 auth 目錄不是 ~/.cli-proxy-api 時才需要設定 +# CLIPROXY_AUTH_DIR=/absolute/path/to/.cli-proxy-api +``` + +啟動兩個服務: + +```bash +docker compose up --build +``` + +BilingualSub 會跑在 http://localhost:7860。它會透過 compose network 連到 +`http://cli-proxy:8317/v1`,OAuth token 不會被打包進 image,也不會 commit +到 repo。proxy 對 host 只綁定 `127.0.0.1`;compose stack 內部固定使用本機 +bearer key `bilingualsub-local`。 + +預設 alias 對應 Antigravity 的 `gemini-3.5-flash-low`,這是目前 CLIProxyAPI +版本中較穩定可發現的 Flash 變體。如果你的版本沒有這個 alias,可以列出可用 +模型,並在 `.env` 設定 `TRANSLATOR_MODEL=openai:`: + +```bash +curl -H "Authorization: Bearer bilingualsub-local" http://localhost:8317/v1/models +``` + ### 本地開發 **前置需求**:Python 3.11+、FFmpeg、Node.js 18+、pnpm @@ -56,6 +103,7 @@ cd frontend && pnpm dev | `TRANSCRIBER_PROVIDER` | 語音辨識供應商 | `groq` | 否 | | `TRANSCRIBER_MODEL` | 使用的 Whisper 模型 | `whisper-large-v3-turbo` | 否 | | `TRANSLATOR_MODEL` | 翻譯用的 LLM 模型 | `groq:openai/gpt-oss-120b` | 否 | +| `OPENAI_BASE_URL` | OpenAI-compatible proxy URL | - | 否 | ## 架構說明 diff --git a/cliproxyapi.conf b/cliproxyapi.conf new file mode 100644 index 0000000..3e05a79 --- /dev/null +++ b/cliproxyapi.conf @@ -0,0 +1,25 @@ +# Minimal CLIProxyAPI config for BilingualSub's Docker Compose setup. +# OAuth credentials are discovered from auth-dir, mounted from the host's +# ~/.cli-proxy-api directory by docker-compose.yml. + +host: "" +port: 8317 +auth-dir: "/root/.cli-proxy-api" + +api-keys: + - "bilingualsub-local" + +debug: false +logging-to-file: false +usage-statistics-enabled: false + +# Optional alias used by docker-compose.yml's default TRANSLATOR_MODEL. +# If CLIProxyAPI changes Antigravity upstream model IDs, set TRANSLATOR_MODEL +# in .env to one of the IDs returned by: +# curl -H "Authorization: Bearer bilingualsub-local" http://localhost:8317/v1/models +oauth-model-alias: + antigravity: + - name: "gemini-3.5-flash-low" + alias: "bilingualsub-gemini-flash" + fork: true + force-mapping: true diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..e3feec2 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,27 @@ +services: + cli-proxy: + image: eceasy/cli-proxy-api:latest + restart: unless-stopped + ports: + - '127.0.0.1:${CLIPROXY_PORT:-8317}:8317' + volumes: + - '${CLIPROXY_AUTH_DIR:-${HOME}/.cli-proxy-api}:/root/.cli-proxy-api' + - './cliproxyapi.conf:/CLIProxyAPI/config.yaml:ro' + command: ['./CLIProxyAPI', '-config', '/CLIProxyAPI/config.yaml'] + + bilingualsub: + build: . + image: bilingualsub:latest + restart: unless-stopped + ports: + - '${BILINGUALSUB_PORT:-7860}:7860' + environment: + GROQ_API_KEY: '${GROQ_API_KEY:?Set GROQ_API_KEY in .env or your shell}' + GEMINI_API_KEY: '${GEMINI_API_KEY:-}' + TRANSCRIBER_PROVIDER: '${TRANSCRIBER_PROVIDER:-groq}' + TRANSCRIBER_MODEL: '${TRANSCRIBER_MODEL:-whisper-large-v3-turbo}' + TRANSLATOR_MODEL: '${TRANSLATOR_MODEL:-openai:bilingualsub-gemini-flash}' + OPENAI_BASE_URL: 'http://cli-proxy:8317/v1' + OPENAI_API_KEY: 'bilingualsub-local' # pragma: allowlist secret + depends_on: + - cli-proxy diff --git a/frontend/src/components/SubtitleEditor.test.tsx b/frontend/src/components/SubtitleEditor.test.tsx new file mode 100644 index 0000000..614721e --- /dev/null +++ b/frontend/src/components/SubtitleEditor.test.tsx @@ -0,0 +1,89 @@ +import { fireEvent, render, screen, waitFor } from '@testing-library/react'; +import { SubtitleEditor } from './SubtitleEditor'; + +const apiMocks = vi.hoisted(() => ({ + fetchSrtContent: vi.fn(), + partialRetranslate: vi.fn(), + addGlossaryEntry: vi.fn(), +})); + +const i18nMocks = vi.hoisted(() => ({ + t: (key: string) => key, +})); + +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: i18nMocks.t, + i18n: { language: 'zh-TW', changeLanguage: vi.fn() }, + }), +})); + +vi.mock('@/api/client', () => ({ + apiClient: { + fetchSrtContent: apiMocks.fetchSrtContent, + partialRetranslate: apiMocks.partialRetranslate, + getDownloadUrl: (jobId: string, fileType: string) => `/api/jobs/${jobId}/download/${fileType}`, + addGlossaryEntry: apiMocks.addGlossaryEntry, + }, +})); + +const srtContent = `1 +00:00:01,000 --> 00:00:02,000 +Old translation +old source + +2 +00:00:03,000 --> 00:00:04,000 +Untouched translation +untouched source`; + +function mockTextTrack() { + const cues: TextTrackCue[] = []; + return { + cues, + mode: 'hidden', + addCue: vi.fn((cue: TextTrackCue) => cues.push(cue)), + removeCue: vi.fn((cue: TextTrackCue) => { + const index = cues.indexOf(cue); + if (index >= 0) cues.splice(index, 1); + }), + }; +} + +describe('SubtitleEditor partial retranslate preview', () => { + beforeEach(() => { + vi.clearAllMocks(); + apiMocks.fetchSrtContent.mockResolvedValue(srtContent); + HTMLMediaElement.prototype.addTextTrack = vi.fn(() => mockTextTrack() as unknown as TextTrack); + HTMLMediaElement.prototype.play = vi.fn(); + globalThis.VTTCue = vi.fn(function VTTCue(startTime, endTime, text) { + return { startTime, endTime, text }; + }) as unknown as typeof VTTCue; + }); + + it('previews and applies corrected source text with translated text', async () => { + apiMocks.partialRetranslate.mockResolvedValue({ + results: [{ index: 1, original: 'correct source', translated: 'New translation' }], + }); + + render(); + + await screen.findByDisplayValue('Old translation'); + const retranslateButton = screen.getByRole('button', { name: 'editor.retranslate' }); + fireEvent.click(screen.getAllByTitle('editor.selectForRetranslate')[0]); + await waitFor(() => expect(retranslateButton).toBeEnabled()); + fireEvent.click(retranslateButton); + + await screen.findByText('correct source'); + expect(screen.getAllByText('old source')).toHaveLength(2); + expect(screen.getByText('New translation')).toBeInTheDocument(); + + fireEvent.click(screen.getByText('editor.retranslatePreviewApply')); + + await waitFor(() => { + expect(screen.getByDisplayValue('New translation')).toBeInTheDocument(); + }); + expect(screen.getByText('correct source')).toBeInTheDocument(); + expect(screen.queryByText('old source')).not.toBeInTheDocument(); + }); +}); diff --git a/frontend/src/components/SubtitleEditor.tsx b/frontend/src/components/SubtitleEditor.tsx index 3deb6f5..ba879f1 100644 --- a/frontend/src/components/SubtitleEditor.tsx +++ b/frontend/src/components/SubtitleEditor.tsx @@ -16,9 +16,10 @@ interface SubtitleEditorProps { interface RetranslatePreviewItem { index: number; - original: string; - before: string; - after: string; + originalBefore: string; + originalAfter: string; + translatedBefore: string; + translatedAfter: string; } type RetranslateChoice = 'before' | 'after'; @@ -122,6 +123,7 @@ export function SubtitleEditor({ jobId, onBurn, isBurning }: SubtitleEditorProps return entries.some( (entry, i) => entry.translated !== originalEntries[i].translated || + entry.original !== originalEntries[i].original || entry.startTime !== originalEntries[i].startTime || entry.endTime !== originalEntries[i].endTime ); @@ -234,17 +236,19 @@ export function SubtitleEditor({ jobId, onBurn, isBurning }: SubtitleEditorProps user_context: retranslateContext.trim() || undefined, }); - const translatedMap = new Map( - response.results.map(item => [item.index, item.translated] as const) - ); + const resultMap = new Map(response.results.map(item => [item.index, item] as const)); const previewItems = entries - .filter(entry => translatedMap.has(entry.index)) - .map(entry => ({ - index: entry.index, - original: entry.original, - before: entry.translated, - after: translatedMap.get(entry.index) ?? entry.translated, - })) + .filter(entry => resultMap.has(entry.index)) + .map(entry => { + const result = resultMap.get(entry.index)!; + return { + index: entry.index, + originalBefore: entry.original, + originalAfter: result.original, + translatedBefore: entry.translated, + translatedAfter: result.translated, + }; + }) .sort((a, b) => a.index - b.index); if (previewItems.length === 0) { @@ -274,18 +278,22 @@ export function SubtitleEditor({ jobId, onBurn, isBurning }: SubtitleEditorProps ]); const handleApplyRetranslatePreview = useCallback(() => { - const translatedMap = new Map( + const acceptedMap = new Map( retranslatePreview.map(item => { const choice = retranslateChoices[item.index] ?? 'after'; - return [item.index, choice === 'before' ? item.before : item.after] as const; + return [ + item.index, + choice === 'before' + ? { original: item.originalBefore, translated: item.translatedBefore } + : { original: item.originalAfter, translated: item.translatedAfter }, + ] as const; }) ); setEntries(prev => - prev.map(entry => - translatedMap.has(entry.index) - ? { ...entry, translated: translatedMap.get(entry.index) ?? entry.translated } - : entry - ) + prev.map(entry => { + const accepted = acceptedMap.get(entry.index); + return accepted ? { ...entry, ...accepted } : entry; + }) ); setSelectedIndices(new Set()); setRetranslatePreview([]); @@ -451,7 +459,6 @@ export function SubtitleEditor({ jobId, onBurn, isBurning }: SubtitleEditorProps className="border border-gray-100 rounded-xl p-4" >

#{item.index}

- {item.original &&

{item.original}

}
diff --git a/frontend/src/types.ts b/frontend/src/types.ts index ce6d2e8..5a17160 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -68,6 +68,7 @@ export interface PartialRetranslateRequest { export interface PartialRetranslateResult { index: number; + original: string; translated: string; } diff --git a/src/bilingualsub/api/pipeline.py b/src/bilingualsub/api/pipeline.py index 42fab1f..6b6e566 100644 --- a/src/bilingualsub/api/pipeline.py +++ b/src/bilingualsub/api/pipeline.py @@ -30,6 +30,7 @@ TranslationError, VideoMetadata, VisualDescriptionError, + build_whisper_prompt, describe_video, download_video, merge_subtitles, @@ -491,8 +492,12 @@ async def run_subtitle(job: Job) -> None: job, JobStatus.TRANSCRIBING, 20.0, "transcribe", "Transcribing audio" ) t0 = time.monotonic() + whisper_prompt = build_whisper_prompt(video_title=job.video_title) original_sub = await asyncio.to_thread( - transcribe_audio, audio_path, language=job.source_lang + transcribe_audio, + audio_path, + language=job.source_lang, + prompt=whisper_prompt, ) log.info( "step_done", diff --git a/src/bilingualsub/api/routes.py b/src/bilingualsub/api/routes.py index 09c0306..c16f982 100644 --- a/src/bilingualsub/api/routes.py +++ b/src/bilingualsub/api/routes.py @@ -377,8 +377,12 @@ async def partial_retranslate( return PartialRetranslateResponse( results=[ - PartialRetranslateItem(index=index, translated=translated) - for index, translated in sorted(results.items()) + PartialRetranslateItem( + index=index, + original=result.original, + translated=result.translated, + ) + for index, result in sorted(results.items()) ] ) diff --git a/src/bilingualsub/api/schemas.py b/src/bilingualsub/api/schemas.py index 06fa1dd..e61a4e6 100644 --- a/src/bilingualsub/api/schemas.py +++ b/src/bilingualsub/api/schemas.py @@ -108,6 +108,7 @@ class PartialRetranslateItem(BaseModel): """Single re-translated item.""" index: int + original: str translated: str diff --git a/src/bilingualsub/core/__init__.py b/src/bilingualsub/core/__init__.py index e3d2e7b..bf15e4d 100644 --- a/src/bilingualsub/core/__init__.py +++ b/src/bilingualsub/core/__init__.py @@ -9,9 +9,14 @@ from bilingualsub.core.merger import merge_subtitles from bilingualsub.core.subtitle import Subtitle, SubtitleEntry from bilingualsub.core.subtitle_fetcher import SubtitleFetchError, fetch_manual_subtitle -from bilingualsub.core.transcriber import TranscriptionError, transcribe_audio +from bilingualsub.core.transcriber import ( + TranscriptionError, + build_whisper_prompt, + transcribe_audio, +) from bilingualsub.core.translator import ( RetranslateEntry, + RetranslateResult, TranslationError, retranslate_entries, translate_subtitle, @@ -27,6 +32,7 @@ "GlossaryError", "GlossaryManager", "RetranslateEntry", + "RetranslateResult", "Subtitle", "SubtitleEntry", "SubtitleFetchError", @@ -34,6 +40,7 @@ "TranslationError", "VideoMetadata", "VisualDescriptionError", + "build_whisper_prompt", "describe_video", "download_video", "fetch_manual_subtitle", diff --git a/src/bilingualsub/core/transcriber.py b/src/bilingualsub/core/transcriber.py index 6c0a658..e5f9701 100644 --- a/src/bilingualsub/core/transcriber.py +++ b/src/bilingualsub/core/transcriber.py @@ -11,18 +11,42 @@ from bilingualsub.utils.config import get_groq_api_key, get_openai_api_key, get_settings from bilingualsub.utils.ffmpeg import split_audio +_MAX_WHISPER_PROMPT_CHARS = 800 + class TranscriptionError(Exception): """Raised when audio transcription fails.""" -def _transcribe_single(audio_path: Path, *, language: str, settings: Any) -> Subtitle: +def build_whisper_prompt( + video_title: str = "", +) -> str | None: + """Build a concise Whisper hint from the video title. + + Args: + video_title: Raw video title string. + + Returns: + Cleaned title string, or None if the title is blank. + """ + title = video_title.strip() + if not title: + return None + if len(title) > _MAX_WHISPER_PROMPT_CHARS: + return title[:_MAX_WHISPER_PROMPT_CHARS] + return title + + +def _transcribe_single( + audio_path: Path, *, language: str, settings: Any, prompt: str | None = None +) -> Subtitle: """Transcribe a single audio file (must be <= 25MB). Args: audio_path: Path to audio file language: ISO 639-1 language code settings: Application settings + prompt: Optional hint text to guide transcription accuracy Returns: Subtitle object with transcribed entries @@ -45,12 +69,15 @@ def _transcribe_single(audio_path: Path, *, language: str, settings: Any) -> Sub try: with audio_path.open("rb") as audio_file: - transcription = client.audio.transcriptions.create( - file=(audio_path.name, audio_file), - model=settings.transcriber_model, - response_format="verbose_json", - language=language, - ) + create_kwargs: dict[str, Any] = { + "file": (audio_path.name, audio_file), + "model": settings.transcriber_model, + "response_format": "verbose_json", + "language": language, + } + if prompt: + create_kwargs["prompt"] = prompt + transcription = client.audio.transcriptions.create(**create_kwargs) except Exception as e: raise TranscriptionError(f"Failed to transcribe audio: {e}") from e @@ -59,15 +86,21 @@ def _transcribe_single(audio_path: Path, *, language: str, settings: Any) -> Sub if not segments: raise TranscriptionError("Transcription returned no segments") - entries = [] - for i, seg in enumerate(segments, start=1): - entry = SubtitleEntry( + entries = [ + SubtitleEntry( index=i, start=timedelta(seconds=seg["start"]), end=timedelta(seconds=seg["end"]), text=seg["text"].strip(), ) - entries.append(entry) + for i, seg in enumerate( + (s for s in segments if s["start"] < s["end"] and s["text"].strip()), + start=1, + ) + ] + + if not entries: + raise TranscriptionError("No valid segments after filtering") return Subtitle(entries=entries) except TranscriptionError: @@ -76,7 +109,9 @@ def _transcribe_single(audio_path: Path, *, language: str, settings: Any) -> Sub raise TranscriptionError(f"Failed to parse transcription result: {e}") from e -def transcribe_audio(audio_path: Path, *, language: str = "en") -> Subtitle: +def transcribe_audio( + audio_path: Path, *, language: str = "en", prompt: str | None = None +) -> Subtitle: """ Transcribe audio file to subtitle using Whisper API. @@ -85,6 +120,8 @@ def transcribe_audio(audio_path: Path, *, language: str = "en") -> Subtitle: Args: audio_path: Path to audio/video file language: ISO 639-1 language code (e.g., "en", "zh", "ja") + prompt: Optional hint text (e.g., from build_whisper_prompt) to improve + proper noun recognition Returns: Subtitle object with transcribed entries @@ -99,12 +136,14 @@ def transcribe_audio(audio_path: Path, *, language: str = "en") -> Subtitle: if not audio_path.is_file(): raise ValueError(f"Audio path is not a file: {audio_path}") - language = language.split("-")[0] + language = language.split("-", maxsplit=1)[0] settings = get_settings() file_size_mb = audio_path.stat().st_size / (1024 * 1024) if file_size_mb <= 25: - return _transcribe_single(audio_path, language=language, settings=settings) + return _transcribe_single( + audio_path, language=language, settings=settings, prompt=prompt + ) # Large file: split into chunks and transcribe each @@ -112,7 +151,9 @@ def transcribe_audio(audio_path: Path, *, language: str = "en") -> Subtitle: all_entries: list[SubtitleEntry] = [] idx = 1 for chunk_path, time_offset in chunks: - subtitle = _transcribe_single(chunk_path, language=language, settings=settings) + subtitle = _transcribe_single( + chunk_path, language=language, settings=settings, prompt=prompt + ) offset_td = timedelta(seconds=time_offset) for entry in subtitle.entries: all_entries.append( diff --git a/src/bilingualsub/core/translator.py b/src/bilingualsub/core/translator.py index ab2d13d..c84ea9b 100644 --- a/src/bilingualsub/core/translator.py +++ b/src/bilingualsub/core/translator.py @@ -4,12 +4,17 @@ import time from collections.abc import Callable from dataclasses import dataclass +from json import JSONDecodeError, loads +from urllib.parse import urlparse import structlog from agno.agent import Agent +from agno.models.base import Model +from agno.models.openai import OpenAIChat from bilingualsub.core.subtitle import Subtitle, SubtitleEntry from bilingualsub.utils.config import ( + Settings, get_groq_api_key, get_openai_api_key, get_settings, @@ -18,12 +23,15 @@ logger = structlog.get_logger() _BATCH_SIZE = 10 -_CONTEXT_SIZE = 3 # Number of previous entries to include as context +_CONTEXT_SIZE = 5 # Number of previous entries to include as context _LOOKAHEAD_SIZE = 3 # Number of upcoming entries to include as forward context _MAX_RETRIES = 5 -_PARTIAL_CONTEXT_WINDOW = 2 +_PARTIAL_CONTEXT_WINDOW = 5 _MAX_METADATA_TITLE_CHARS = 200 _MAX_METADATA_DESC_CHARS = 1200 +_GROQ_PREFIX = "groq:" +_OPENAI_PREFIX = "openai:" +_PROXY_PLACEHOLDER_API_KEY = "dummy" # pragma: allowlist secret class TranslationError(Exception): @@ -47,19 +55,80 @@ class RetranslateEntry: translated: str = "" -def _ensure_translator_api_key(translator_model: str) -> None: +@dataclass +class RetranslateResult: + """Structured result from partial re-translation.""" + + index: int + original: str + translated: str + + +def _is_openai_model(model_str: str) -> bool: + return model_str.strip().lower().startswith(_OPENAI_PREFIX) + + +def _ensure_translator_api_key(settings: Settings) -> None: """Validate API key for managed translator providers. + Skips the OpenAI key check when a proxy base URL is configured, + since proxies supply their own authentication. + Raises: ValueError: If required provider key is missing. """ - model_prefix = translator_model.strip().lower() - if model_prefix.startswith("groq:"): + model_str = settings.translator_model + if model_str.strip().lower().startswith(_GROQ_PREFIX): get_groq_api_key() - elif model_prefix.startswith("openai:"): + elif _is_openai_model(model_str) and not settings.openai_base_url: get_openai_api_key() +def _build_model(settings: Settings) -> str | Model: + """Build an Agno model instance or model string for the translator. + + When the translator model has an ``openai:`` prefix AND a custom + ``OPENAI_BASE_URL`` is configured, constructs an :class:`OpenAIChat` model + pointed at the proxy endpoint. This allows OpenAI-compatible proxies + (e.g. CLIProxyAPI) to be used without touching the Agno provider registry. + + In all other cases the raw model string is returned and Agno handles + provider resolution itself (existing behavior). + """ + model_str = settings.translator_model + if _is_openai_model(model_str) and settings.openai_base_url: + # _is_openai_model lowercases; slice original to preserve casing + model_id = model_str.strip()[len(_OPENAI_PREFIX) :] + return OpenAIChat( + id=model_id, + base_url=settings.openai_base_url, + api_key=settings.openai_api_key or _PROXY_PLACEHOLDER_API_KEY, + ) + return model_str + + +def _model_log_metadata(settings: Settings) -> dict[str, str | None]: + """Return safe model metadata for structured logs.""" + model_str = settings.translator_model.strip() + provider_kind = "agno" + model_id = model_str + lower_model = model_str.lower() + if lower_model.startswith(_GROQ_PREFIX): + provider_kind = "groq" + model_id = model_str[len(_GROQ_PREFIX) :] + elif lower_model.startswith(_OPENAI_PREFIX): + provider_kind = "openai" + model_id = model_str[len(_OPENAI_PREFIX) :] + + parsed_base_url = urlparse(settings.openai_base_url or "") + base_url_host = parsed_base_url.hostname if parsed_base_url.hostname else None + return { + "model_id": model_id, + "provider_kind": provider_kind, + "base_url_host": base_url_host, + } + + def _compact_text(text: str) -> str: """Normalize whitespace while preserving readable punctuation.""" return re.sub(r"\s+", " ", text).strip() @@ -121,6 +190,63 @@ def _strip_number_prefix(text: str) -> str: return re.sub(r"^\s*\d+\s*[.):\uff0e]\s*", "", text, count=1) +def _strip_json_fence(text: str) -> str: + """Remove a Markdown JSON fence if the model wrapped the response.""" + stripped = text.strip() + if not stripped.startswith("```"): + return stripped + lines = stripped.splitlines() + if len(lines) >= 3 and lines[-1].strip() == "```": + return "\n".join(lines[1:-1]).strip() + return stripped + + +def _parse_retranslate_response( + response_text: str, + *, + expected_index: int, +) -> RetranslateResult: + """Parse structured partial re-translation output.""" + cleaned = _strip_json_fence(response_text) + try: + payload = loads(cleaned) + except JSONDecodeError as err: + raise TranslationError( + f"Could not parse re-translation JSON for entry {expected_index}" + ) from err + + if not isinstance(payload, dict): + raise TranslationError( + f"Expected re-translation JSON object for entry {expected_index}" + ) + + try: + index = int(payload["index"]) + except (KeyError, TypeError, ValueError) as err: + raise TranslationError( + f"Invalid re-translation index for entry {expected_index}" + ) from err + + if index != expected_index: + raise TranslationError( + f"Expected re-translation index {expected_index}, got {index}" + ) + + original = str(payload.get("original") or "").strip() + if not original: + raise TranslationError( + "Missing original text in re-translation response " + f"for entry {expected_index}" + ) + + translated = str(payload.get("translated") or "").strip() + if not translated: + raise TranslationError( + f"Empty re-translation response for entry {expected_index}" + ) + return RetranslateResult(index=index, original=original, translated=translated) + + def _check_rate_limit(response_text: str) -> None: """Raise RateLimitError if response contains rate limit error. @@ -223,19 +349,27 @@ def _translate_batch( prompt = ( f"{context_section}" f"將以下編號字幕從{source_lang}翻譯成{target_lang}。\n" - f"只回傳編號翻譯,每行一條,編號與原文一致。\n\n" # noqa: RUF001 + f"只回傳編號翻譯,每行一條,編號與原文一致。\n" # noqa: RUF001 + "若原文專有名詞疑似語音辨識錯字,請依上文、下文、影片背景與術語表修正後翻譯。" # noqa: RUF001 + "例如同一影片已出現的品牌、人名、產品名與網域應保持一致。\n\n" f"{numbered_lines}" f"{lookahead_section}" ) logger.debug( - "Batch translation prompt (entries %d-%d):\n%s", - batch[0].index, - batch[-1].index, - prompt, + "translation_batch_request", + source_lang=source_lang, + target_lang=target_lang, + entry_count=len(batch), + batch_start_index=batch[0].index, + batch_end_index=batch[-1].index, + has_context=bool(context), + lookahead_count=len(lookahead or []), ) + started_at = time.monotonic() response = translator.run(prompt) + duration_ms = round((time.monotonic() - started_at) * 1000) response_text = response.content.strip() if response.content else "" if not response_text: raise TranslationError("Empty batch translation response") @@ -243,10 +377,14 @@ def _translate_batch( _check_rate_limit(response_text) logger.debug( - "Batch translation response (entries %d-%d):\n%s", - batch[0].index, - batch[-1].index, - response_text, + "translation_batch_response", + source_lang=source_lang, + target_lang=target_lang, + entry_count=len(batch), + batch_start_index=batch[0].index, + batch_end_index=batch[-1].index, + duration_ms=duration_ms, + response_chars=len(response_text), ) return _parse_batch_response(response_text, len(batch)) @@ -290,10 +428,11 @@ def _translate_one_by_one( f"{entry.index}: {entry.text}" ) logger.debug( - "One-by-one translation for entry %d: '%s' -> '%s'", - entry.index, - entry.text, - translated_text, + "translation_one_by_one_entry_completed", + index=entry.index, + source_lang=source_lang, + target_lang=target_lang, + response_chars=len(translated_text), ) results.append(translated_text) return results @@ -334,9 +473,10 @@ def translate_subtitle( ValueError: If provider API key is missing """ settings = get_settings() - _ensure_translator_api_key(settings.translator_model) + _ensure_translator_api_key(settings) + model_metadata = _model_log_metadata(settings) translator = Agent( - model=settings.translator_model, + model=_build_model(settings), description=_build_translator_description( source_lang=source_lang, target_lang=target_lang, @@ -348,16 +488,29 @@ def translate_subtitle( entries = subtitle.entries translated_texts: list[str] = [] + logger.info( + "translation_started", + **model_metadata, + source_lang=source_lang, + target_lang=target_lang, + entry_count=len(entries), + batch_size=_BATCH_SIZE, + ) + started_at = time.monotonic() for i in range(0, len(entries), _BATCH_SIZE): batch = entries[i : i + _BATCH_SIZE] logger.debug( - "Processing batch %d/%d (entries %d-%d)", - i // _BATCH_SIZE + 1, - (len(entries) + _BATCH_SIZE - 1) // _BATCH_SIZE, - batch[0].index, - batch[-1].index, + "translation_batch_started", + **model_metadata, + source_lang=source_lang, + target_lang=target_lang, + batch_number=i // _BATCH_SIZE + 1, + batch_count=(len(entries) + _BATCH_SIZE - 1) // _BATCH_SIZE, + batch_start_index=batch[0].index, + batch_end_index=batch[-1].index, + entry_count=len(batch), ) # Collect context from previously translated entries @@ -393,16 +546,19 @@ def translate_subtitle( except (TranslationError, Exception) as exc: # Fallback to one-by-one for non-rate-limit errors logger.warning( - "Batch translation failed for entries %d-%d, " - "falling back to one-by-one: %s", - i + 1, - i + len(batch), - exc, + "translation_batch_fallback", + **model_metadata, + source_lang=source_lang, + target_lang=target_lang, + batch_start_index=batch[0].index, + batch_end_index=batch[-1].index, + entry_count=len(batch), + error_type=type(exc).__name__, ) logger.debug( - "Falling back to one-by-one for entries %d-%d", - i + 1, - i + len(batch), + "translation_one_by_one_fallback_started", + batch_start_index=batch[0].index, + batch_end_index=batch[-1].index, ) batch_translations = _translate_one_by_one( translator, batch, source_lang, target_lang @@ -416,12 +572,13 @@ def translate_subtitle( except RateLimitError as exc: if attempt < _MAX_RETRIES: logger.warning( - "Rate limited at entries %d-%d (attempt %d/%d), waiting %.0fs", - batch[0].index, - batch[-1].index, - attempt + 1, - _MAX_RETRIES, - exc.retry_after, + "translation_rate_limited", + **model_metadata, + batch_start_index=batch[0].index, + batch_end_index=batch[-1].index, + attempt=attempt + 1, + max_retries=_MAX_RETRIES, + retry_after_seconds=exc.retry_after, ) if on_rate_limit is not None: on_rate_limit(exc.retry_after, attempt + 1, _MAX_RETRIES) @@ -446,9 +603,61 @@ def translate_subtitle( for entry, text in zip(entries, translated_texts, strict=True) ] + logger.info( + "translation_completed", + **model_metadata, + source_lang=source_lang, + target_lang=target_lang, + entry_count=len(entries), + duration_ms=round((time.monotonic() - started_at) * 1000), + ) + return Subtitle(entries=translated_entries) +def _build_retranslate_prompt( + *, + target_entry: RetranslateEntry, + prev_entries: list[RetranslateEntry], + next_entries: list[RetranslateEntry], + normalized_user_context: str, + source_lang: str, + target_lang: str, +) -> str: + """Build the partial re-translation prompt for one selected entry.""" + sections: list[str] = [] + if prev_entries: + prev_lines = "\n".join( + f"- {entry.original} → {entry.translated or '(待翻譯)'}" + for entry in prev_entries + ) + sections.append(f"【上文參考】\n{prev_lines}") + + if next_entries: + next_lines = "\n".join( + f"- {entry.original} → {entry.translated or '(待翻譯)'}" + for entry in next_entries + ) + sections.append(f"【下文參考】\n{next_lines}") + + if normalized_user_context: + sections.append(f"【使用者補充上下文】\n{normalized_user_context}") + + prompt_sections = "\n\n".join(sections) + instruction = ( + f"請將以下字幕從{source_lang}翻譯成{target_lang}。\n" + "只回傳一個 JSON 物件,不要加 Markdown、引號外文字或任何說明。\n" # noqa: RUF001 + '格式:{"index": 數字, "original": "修正後原文", ' # noqa: RUF001 + '"translated": "目標語言翻譯"}。\n' + "若原文專有名詞疑似語音辨識錯字,請依上文、下文、影片背景、術語表與使用者補充上下文修正後翻譯。" # noqa: RUF001 + "例如同一影片已出現的品牌、人名、產品名與網域應保持一致。\n\n" + f"index: {target_entry.index}\n" + f"原文: {target_entry.original}\n" + f"目前翻譯(可修正): {target_entry.translated or '(空)'}" # noqa: RUF001 + ) + return (f"{prompt_sections}\n\n" if prompt_sections else "") + instruction + + def retranslate_entries( *, entries: list[RetranslateEntry], @@ -459,7 +668,7 @@ def retranslate_entries( video_description: str = "", glossary_text: str = "", user_context: str | None = None, -) -> dict[int, str]: +) -> dict[int, RetranslateResult]: """Re-translate selected subtitle entries with local context. Args: @@ -472,7 +681,8 @@ def retranslate_entries( user_context: Optional extra context provided by user. Returns: - Mapping: entry index -> translated text. + Mapping: entry index -> structured result containing corrected source and + translated text. Raises: ValueError: If request payload is invalid. @@ -491,9 +701,10 @@ def retranslate_entries( raise ValueError(f"selected_indices not found: {missing}") settings = get_settings() - _ensure_translator_api_key(settings.translator_model) + _ensure_translator_api_key(settings) + model_metadata = _model_log_metadata(settings) translator = Agent( - model=settings.translator_model, + model=_build_model(settings), description=_build_translator_description( source_lang=source_lang, target_lang=target_lang, @@ -504,38 +715,41 @@ def retranslate_entries( ) normalized_user_context = _compact_text(user_context or "") - results: dict[int, str] = {} + results: dict[int, RetranslateResult] = {} + logger.info( + "retranslation_started", + **model_metadata, + source_lang=source_lang, + target_lang=target_lang, + entry_count=len(entries), + selected_indices_count=len(ordered_indices), + ) + retranslation_started_at = time.monotonic() for target_index in ordered_indices: + entry_started_at = time.monotonic() position = position_by_index[target_index] target_entry = entries[position] prev_entries = entries[max(0, position - _PARTIAL_CONTEXT_WINDOW) : position] next_entries = entries[position + 1 : position + 1 + _PARTIAL_CONTEXT_WINDOW] + prompt = _build_retranslate_prompt( + target_entry=target_entry, + prev_entries=prev_entries, + next_entries=next_entries, + normalized_user_context=normalized_user_context, + source_lang=source_lang, + target_lang=target_lang, + ) - sections: list[str] = [] - if prev_entries: - prev_lines = "\n".join( - f"- {entry.original} → {entry.translated or '(待翻譯)'}" - for entry in prev_entries - ) - sections.append(f"【上文參考】\n{prev_lines}") - - if next_entries: - next_lines = "\n".join( - f"- {entry.original} → {entry.translated or '(待翻譯)'}" - for entry in next_entries - ) - sections.append(f"【下文參考】\n{next_lines}") - - if normalized_user_context: - sections.append(f"【使用者補充上下文】\n{normalized_user_context}") - - prompt_sections = "\n\n".join(sections) - prompt = (f"{prompt_sections}\n\n" if prompt_sections else "") + ( - f"請將以下字幕從{source_lang}翻譯成{target_lang}。\n" - "只回傳單行翻譯內容,不要加編號、引號或任何說明。\n\n" # noqa: RUF001 - f"原文: {target_entry.original}\n" - f"目前翻譯(可修正): {target_entry.translated or '(空)'}" # noqa: RUF001 + logger.debug( + "retranslation_entry_request", + **model_metadata, + source_lang=source_lang, + target_lang=target_lang, + index=target_index, + previous_context_count=len(prev_entries), + next_context_count=len(next_entries), + has_user_context=bool(normalized_user_context), ) for attempt in range(_MAX_RETRIES + 1): @@ -547,22 +761,29 @@ def retranslate_entries( f"Empty re-translation response for entry {target_index}" ) _check_rate_limit(response_text) - cleaned = _strip_number_prefix(response_text).strip() - if not cleaned: - raise TranslationError( - f"Empty re-translation response for entry {target_index}" - ) - results[target_index] = cleaned + results[target_index] = _parse_retranslate_response( + response_text, + expected_index=target_index, + ) + logger.debug( + "retranslation_entry_response", + **model_metadata, + source_lang=source_lang, + target_lang=target_lang, + index=target_index, + duration_ms=round((time.monotonic() - entry_started_at) * 1000), + response_chars=len(response_text), + ) break except RateLimitError as exc: if attempt < _MAX_RETRIES: logger.warning( - "Rate limited during re-translation for entry %d " - "(attempt %d/%d), waiting %.0fs", - target_index, - attempt + 1, - _MAX_RETRIES, - exc.retry_after, + "retranslation_rate_limited", + **model_metadata, + index=target_index, + attempt=attempt + 1, + max_retries=_MAX_RETRIES, + retry_after_seconds=exc.retry_after, ) time.sleep(exc.retry_after) else: @@ -573,4 +794,14 @@ def retranslate_entries( else: raise TranslationError(f"Failed to re-translate entry {target_index}") + logger.info( + "retranslation_completed", + **model_metadata, + source_lang=source_lang, + target_lang=target_lang, + entry_count=len(entries), + selected_indices_count=len(ordered_indices), + duration_ms=round((time.monotonic() - retranslation_started_at) * 1000), + ) + return results diff --git a/src/bilingualsub/utils/config.py b/src/bilingualsub/utils/config.py index 8fffec1..ab26abe 100644 --- a/src/bilingualsub/utils/config.py +++ b/src/bilingualsub/utils/config.py @@ -2,6 +2,7 @@ from functools import lru_cache +from pydantic import field_validator from pydantic_settings import BaseSettings, SettingsConfigDict @@ -11,6 +12,7 @@ class Settings(BaseSettings): Attributes: groq_api_key: API key for Groq services (Whisper + LLM) openai_api_key: API key for OpenAI services + openai_base_url: Base URL for OpenAI-compatible proxy (e.g. http://localhost:8317/v1) transcriber_provider: Whisper provider ("groq" or "openai") transcriber_model: Whisper model name translator_model: Agno model string (e.g. "ollama:model_id", "groq:model_id") @@ -20,6 +22,12 @@ class Settings(BaseSettings): groq_api_key: str = "" openai_api_key: str = "" + openai_base_url: str = "" + + @field_validator("openai_base_url") + @classmethod + def strip_trailing_slash(cls, v: str) -> str: + return v.rstrip("/") transcriber_provider: str = "groq" transcriber_model: str = "whisper-large-v3-turbo" diff --git a/src/bilingualsub/utils/ffmpeg.py b/src/bilingualsub/utils/ffmpeg.py index e00fbd0..08319f1 100644 --- a/src/bilingualsub/utils/ffmpeg.py +++ b/src/bilingualsub/utils/ffmpeg.py @@ -19,6 +19,7 @@ _FONT_EN_BOLD = _ASSETS_DIR / "LINESeedSans_Bd.ttf" _FONT_ZH_REGULAR = _ASSETS_DIR / "NotoSansTC-Regular.ttf" _FONT_ZH_BOLD = _ASSETS_DIR / "NotoSansTC-Bold.ttf" +_FONT_ZH_FALLBACK = "Noto Sans CJK TC" def _font_arg(fontfile: Path, fallback_name: str) -> str: @@ -590,7 +591,7 @@ def _next_start() -> float: blocks.append( _dt( "原始影片來自", - _font_arg(_FONT_ZH_REGULAR, "serif"), + _font_arg(_FONT_ZH_REGULAR, _FONT_ZH_FALLBACK), max(1, int(height / 42)), "white@0.6", x_left, @@ -636,7 +637,7 @@ def _next_start() -> float: blocks.append( _dt( video_title, - _font_arg(_FONT_ZH_REGULAR, "serif"), + _font_arg(_FONT_ZH_REGULAR, _FONT_ZH_FALLBACK), max(1, int(height / 34)), "white@0.7", x_left, @@ -673,7 +674,7 @@ def _next_start() -> float: blocks.append( _dt( line, - _font_arg(_FONT_ZH_REGULAR, "serif"), + _font_arg(_FONT_ZH_REGULAR, _FONT_ZH_FALLBACK), max(1, int(height / 45)), "white@0.45", x_left, @@ -729,6 +730,10 @@ def _next_start() -> float: "lavfi", "-i", f"color=c=black:s={width}x{height}:r={fps}:d={duration}", + "-f", + "lavfi", + "-i", + "anullsrc=channel_layout=stereo:sample_rate=48000", "-vf", vf, "-c:v", @@ -737,7 +742,11 @@ def _next_start() -> float: "23", "-preset", "fast", - "-an", + "-c:a", + "aac", + "-b:a", + "128k", + "-shortest", "-progress", "pipe:1", "-y", diff --git a/tests/integration/test_intro_concat_audio.py b/tests/integration/test_intro_concat_audio.py new file mode 100644 index 0000000..228cc01 --- /dev/null +++ b/tests/integration/test_intro_concat_audio.py @@ -0,0 +1,101 @@ +"""Regression tests for intro + video concat audio preservation.""" + +from __future__ import annotations + +import json +import shutil +import subprocess +from typing import TYPE_CHECKING + +import pytest + +from bilingualsub.utils.ffmpeg import concat_videos, generate_intro + +if TYPE_CHECKING: + from pathlib import Path + + +def _has_ffmpeg_tools() -> bool: + return shutil.which("ffmpeg") is not None and shutil.which("ffprobe") is not None + + +requires_ffmpeg_tools = pytest.mark.skipif( + not _has_ffmpeg_tools(), + reason="ffmpeg and ffprobe are required for media regression tests", +) + + +def _create_video_with_audio(path: Path) -> None: + subprocess.run( + [ + "ffmpeg", + "-f", + "lavfi", + "-i", + "testsrc2=size=320x180:rate=24:d=1", + "-f", + "lavfi", + "-i", + "sine=frequency=440:sample_rate=48000:d=1", + "-c:v", + "libx264", + "-pix_fmt", + "yuv420p", + "-c:a", + "aac", + "-ac", + "2", + "-shortest", + "-y", + str(path), + ], + check=True, + capture_output=True, + text=True, + ) + + +def _audio_stream_count(path: Path) -> int: + result = subprocess.run( + [ + "ffprobe", + "-v", + "error", + "-show_streams", + "-select_streams", + "a", + "-of", + "json", + str(path), + ], + check=True, + capture_output=True, + text=True, + ) + return len(json.loads(result.stdout).get("streams", [])) + + +@pytest.mark.integration +@requires_ffmpeg_tools +def test_intro_concat_preserves_main_video_audio(tmp_path: Path) -> None: + """Regression: a silent intro must not make the final concat output video-only.""" + intro = tmp_path / "intro.mp4" + main = tmp_path / "main.mp4" + final = tmp_path / "final.mp4" + + generate_intro( + intro, + width=320, + height=180, + fps=24.0, + channel="ClaudeDevs", + video_title="Artifacts in Claude Code", + video_url="https://x.com/ClaudeDevs/status/2072770790114914317?s=20", + channel_url="https://x.com/ClaudeDevs", + duration=1.0, + ) + _create_video_with_audio(main) + + concat_videos(intro, main, final) + + assert _audio_stream_count(final) == 1 diff --git a/tests/unit/api/test_pipeline.py b/tests/unit/api/test_pipeline.py index 6cc3ea2..b64e3b6 100644 --- a/tests/unit/api/test_pipeline.py +++ b/tests/unit/api/test_pipeline.py @@ -106,6 +106,7 @@ async def test_successful_pipeline( translate_call_kwargs = mock_translate.call_args.kwargs assert "on_progress" in translate_call_kwargs assert callable(translate_call_kwargs["on_progress"]) + assert mock_transcribe.call_args.kwargs["prompt"] == "Test Video" @patch("bilingualsub.api.pipeline.download_video") async def test_download_error(self, mock_download) -> None: diff --git a/tests/unit/api/test_routes.py b/tests/unit/api/test_routes.py index 32efc77..596271b 100644 --- a/tests/unit/api/test_routes.py +++ b/tests/unit/api/test_routes.py @@ -10,6 +10,7 @@ from bilingualsub.api.constants import FileType, JobStatus from bilingualsub.api.jobs import Job, JobManager from bilingualsub.api.routes import _build_download_filename, _sanitize_filename +from bilingualsub.core import RetranslateResult from bilingualsub.core.glossary import GlossaryManager @@ -196,7 +197,13 @@ async def test_partial_retranslate_success(self, client: AsyncClient, app) -> No job.target_lang = "zh-TW" with patch("bilingualsub.api.routes.retranslate_entries") as mock_retranslate: - mock_retranslate.return_value = {2: "修正版第二句"} + mock_retranslate.return_value = { + 2: RetranslateResult( + index=2, + original="Corrected Line 2", + translated="修正版第二句", + ) + } response = await client.post( f"/api/jobs/{job_id}/retranslate", json={ @@ -211,7 +218,13 @@ async def test_partial_retranslate_success(self, client: AsyncClient, app) -> No assert response.status_code == 200 data = response.json() - assert data["results"] == [{"index": 2, "translated": "修正版第二句"}] + assert data["results"] == [ + { + "index": 2, + "original": "Corrected Line 2", + "translated": "修正版第二句", + } + ] call_kwargs = mock_retranslate.call_args.kwargs assert call_kwargs["glossary_text"] == "" # empty glossary diff --git a/tests/unit/api/test_schemas.py b/tests/unit/api/test_schemas.py index 9228e57..b9947cf 100644 --- a/tests/unit/api/test_schemas.py +++ b/tests/unit/api/test_schemas.py @@ -9,6 +9,7 @@ JobCreateRequest, JobCreateResponse, JobStatusResponse, + PartialRetranslateItem, PartialRetranslateRequest, SSEProgressData, StartSubtitleRequest, @@ -185,3 +186,16 @@ def test_selected_indices_must_exist(self) -> None: {"index": 1, "original": "Line 1", "translated": "第一句"}, ], ) + + def test_response_item_includes_original(self) -> None: + item = PartialRetranslateItem( + index=2, + original="Corrected Line 2", + translated="修正版第二句", + ) + + assert item.model_dump() == { + "index": 2, + "original": "Corrected Line 2", + "translated": "修正版第二句", + } diff --git a/tests/unit/core/test_glossary.py b/tests/unit/core/test_glossary.py index 4598683..e1f5c67 100644 --- a/tests/unit/core/test_glossary.py +++ b/tests/unit/core/test_glossary.py @@ -2,7 +2,11 @@ import pytest -from bilingualsub.core.glossary import GlossaryEntry, GlossaryError, GlossaryManager +from bilingualsub.core.glossary import ( + GlossaryEntry, + GlossaryError, + GlossaryManager, +) @pytest.mark.unit diff --git a/tests/unit/core/test_transcriber.py b/tests/unit/core/test_transcriber.py index 1fd37f5..b0d9222 100644 --- a/tests/unit/core/test_transcriber.py +++ b/tests/unit/core/test_transcriber.py @@ -6,7 +6,11 @@ import pytest from bilingualsub.core.subtitle import Subtitle -from bilingualsub.core.transcriber import TranscriptionError, transcribe_audio +from bilingualsub.core.transcriber import ( + TranscriptionError, + build_whisper_prompt, + transcribe_audio, +) from bilingualsub.utils.config import get_settings @@ -79,7 +83,7 @@ def test_transcribe_valid_audio_file( # Verify transcription API was called mock_client.audio.transcriptions.create.assert_called_once() - call_kwargs = mock_client.audio.transcriptions.create.call_args[1] + call_kwargs = mock_client.audio.transcriptions.create.call_args.kwargs assert call_kwargs["model"] == "whisper-large-v3-turbo" assert call_kwargs["response_format"] == "verbose_json" assert call_kwargs["language"] == "en" @@ -115,7 +119,7 @@ def test_transcribe_with_chinese_language(self, tmp_path, mock_groq, monkeypatch result = transcribe_audio(audio_path, language="zh") # Verify language parameter was passed - call_kwargs = mock_client.audio.transcriptions.create.call_args[1] + call_kwargs = mock_client.audio.transcriptions.create.call_args.kwargs assert call_kwargs["language"] == "zh" # Verify result @@ -304,7 +308,7 @@ def test_various_audio_formats(self, tmp_path, mock_groq, monkeypatch): assert result.entries[0].text == "Test" # Verify correct filename was sent - call_kwargs = mock_client.audio.transcriptions.create.call_args[1] + call_kwargs = mock_client.audio.transcriptions.create.call_args.kwargs assert call_kwargs["file"][0] == f"audio{fmt}" def test_empty_api_key_raises_error(self, tmp_path, monkeypatch, no_env_file): @@ -337,7 +341,7 @@ def test_default_language_is_english(self, tmp_path, mock_groq, monkeypatch): transcribe_audio(audio_path) - call_kwargs = mock_client.audio.transcriptions.create.call_args[1] + call_kwargs = mock_client.audio.transcriptions.create.call_args.kwargs assert call_kwargs["language"] == "en" def test_transcribe_with_openai_provider( @@ -382,7 +386,7 @@ def test_transcribe_with_custom_model( transcribe_audio(audio_path) - call_kwargs = mock_client.audio.transcriptions.create.call_args[1] + call_kwargs = mock_client.audio.transcriptions.create.call_args.kwargs assert call_kwargs["model"] == "whisper-large-v3" def test_unknown_provider_raises_error(self, tmp_path, monkeypatch): @@ -417,3 +421,140 @@ def test_small_file_does_not_trigger_chunking( mock_split.assert_not_called() assert isinstance(result, Subtitle) assert len(result.entries) == 2 + + +@pytest.mark.unit +class TestWhisperPrompt: + """Test cases for build_whisper_prompt and prompt passthrough in transcription.""" + + @pytest.fixture(autouse=True) + def clear_settings_cache(self): + get_settings.cache_clear() + yield + get_settings.cache_clear() + + @pytest.fixture + def mock_groq(self): + with patch("bilingualsub.core.transcriber.Groq") as mock: + yield mock + + @pytest.fixture + def valid_verbose_json_response(self): + response = Mock() + response.segments = [ + {"id": 0, "start": 0.0, "end": 2.0, "text": " Hello world"}, + ] + return response + + def test_build_whisper_prompt_with_title_only(self): + result = build_whisper_prompt(video_title="My Product Review") + assert result == "My Product Review" + + def test_build_whisper_prompt_with_whitespace_title(self): + result = build_whisper_prompt(video_title=" My Product Review ") + assert result == "My Product Review" + + def test_build_whisper_prompt_empty(self): + result = build_whisper_prompt() + assert result is None + + def test_build_whisper_prompt_whitespace_only(self): + result = build_whisper_prompt(video_title=" ") + assert result is None + + def test_build_whisper_prompt_truncates_long_input(self): + long_title = "".join(str(i % 10) for i in range(900)) + result = build_whisper_prompt(video_title=long_title) + assert result == long_title[:800] + assert len(result) == 800 + + def test_transcribe_single_passes_prompt_to_api( + self, tmp_path, mock_groq, valid_verbose_json_response, monkeypatch + ): + monkeypatch.setenv("GROQ_API_KEY", "test-api-key") + + audio_path = tmp_path / "audio.mp3" + audio_path.write_bytes(b"fake audio content") + + mock_client = MagicMock() + mock_groq.return_value = mock_client + mock_client.audio.transcriptions.create.return_value = ( + valid_verbose_json_response + ) + + transcribe_audio(audio_path, prompt="My Product Review. Claude, GPT") + + call_kwargs = mock_client.audio.transcriptions.create.call_args.kwargs + assert call_kwargs["prompt"] == "My Product Review. Claude, GPT" + + def test_transcribe_single_omits_prompt_when_none( + self, tmp_path, mock_groq, valid_verbose_json_response, monkeypatch + ): + monkeypatch.setenv("GROQ_API_KEY", "test-api-key") + + audio_path = tmp_path / "audio.mp3" + audio_path.write_bytes(b"fake audio content") + + mock_client = MagicMock() + mock_groq.return_value = mock_client + mock_client.audio.transcriptions.create.return_value = ( + valid_verbose_json_response + ) + + transcribe_audio(audio_path) + + call_kwargs = mock_client.audio.transcriptions.create.call_args.kwargs + assert "prompt" not in call_kwargs + + def test_transcribe_filters_zero_duration_segments( + self, tmp_path, mock_groq, monkeypatch + ): + monkeypatch.setenv("GROQ_API_KEY", "test-api-key") + + audio_path = tmp_path / "audio.mp3" + audio_path.write_bytes(b"fake audio content") + + response = Mock() + response.segments = [ + {"id": 0, "start": 0.0, "end": 2.0, "text": " Valid segment"}, + {"id": 1, "start": 3.0, "end": 3.0, "text": " Zero duration"}, + {"id": 2, "start": 5.0, "end": 4.0, "text": " Negative duration"}, + {"id": 3, "start": 6.0, "end": 6.5, "text": " "}, + {"id": 4, "start": 7.0, "end": 9.0, "text": " Another valid"}, + ] + + mock_client = MagicMock() + mock_groq.return_value = mock_client + mock_client.audio.transcriptions.create.return_value = response + + result = transcribe_audio(audio_path) + + assert len(result.entries) == 2 + assert result.entries[0].text == "Valid segment" + assert result.entries[1].text == "Another valid" + assert result.entries[0].index == 1 + assert result.entries[1].index == 2 + + def test_transcribe_raises_when_all_segments_filtered( + self, tmp_path, mock_groq, monkeypatch + ): + monkeypatch.setenv("GROQ_API_KEY", "test-api-key") + + audio_path = tmp_path / "audio.mp3" + audio_path.write_bytes(b"fake audio content") + + response = Mock() + response.segments = [ + {"id": 0, "start": 1.0, "end": 1.0, "text": " Zero duration"}, + {"id": 1, "start": 3.0, "end": 2.0, "text": " Negative"}, + {"id": 2, "start": 5.0, "end": 6.0, "text": " "}, + ] + + mock_client = MagicMock() + mock_groq.return_value = mock_client + mock_client.audio.transcriptions.create.return_value = response + + with pytest.raises( + TranscriptionError, match="No valid segments after filtering" + ): + transcribe_audio(audio_path) diff --git a/tests/unit/core/test_translator.py b/tests/unit/core/test_translator.py index bcc0e54..05507f0 100644 --- a/tests/unit/core/test_translator.py +++ b/tests/unit/core/test_translator.py @@ -4,12 +4,17 @@ from unittest.mock import Mock, patch import pytest +from agno.models.openai import OpenAIChat from bilingualsub.core.subtitle import Subtitle, SubtitleEntry from bilingualsub.core.translator import ( + _PROXY_PLACEHOLDER_API_KEY, RetranslateEntry, + RetranslateResult, TranslationError, + _build_model, _parse_batch_response, + _parse_retranslate_response, retranslate_entries, translate_subtitle, ) @@ -248,6 +253,87 @@ def test_parse_batch_response_missing_number(self): _parse_batch_response(response, 3) +class TestParseRetranslateResponse: + def test_parse_retranslate_response_json_object(self): + response = ( + '{"index": 2, "original": "OpenAI released GPT-5", ' + '"translated": "OpenAI 發布了 GPT-5"}' + ) + + result = _parse_retranslate_response( + response, + expected_index=2, + ) + + assert result == RetranslateResult( + index=2, + original="OpenAI released GPT-5", + translated="OpenAI 發布了 GPT-5", + ) + + def test_parse_retranslate_response_markdown_json_fence(self): + response = ( + '```json\n{"index": 2, "original": "Line two", "translated": "第二句"}\n```' + ) + + result = _parse_retranslate_response( + response, + expected_index=2, + ) + + assert result.original == "Line two" + assert result.translated == "第二句" + + @pytest.mark.parametrize( + "response", + [ + '{"results": [{"index": 2, "original": "Line two", "translated": "第二句"}]}', + '{"2": {"original": "Line two", "translated": "第二句"}}', + '[{"index": 2, "original": "Line two", "translated": "第二句"}]', + '"第二句"', + "2. 修正版第二句", + ], + ) + def test_parse_retranslate_response_rejects_unsupported_shapes(self, response): + with pytest.raises(TranslationError): + _parse_retranslate_response( + response, + expected_index=2, + ) + + @pytest.mark.parametrize( + "response", + [ + '{"original": "Line two", "translated": "第二句"}', + '{"index": "1.", "original": "Line two", "translated": "第二句"}', + ], + ) + def test_parse_retranslate_response_invalid_index_raises_translation_error( + self, response + ): + with pytest.raises(TranslationError, match="Invalid re-translation index"): + _parse_retranslate_response( + response, + expected_index=2, + ) + + @pytest.mark.parametrize( + "response", + [ + '{"index": 2, "translated": "第二句"}', + '{"index": 2, "original": "Line two"}', + ], + ) + def test_parse_retranslate_response_requires_original_and_translated( + self, response + ): + with pytest.raises(TranslationError): + _parse_retranslate_response( + response, + expected_index=2, + ) + + class TestBatchTranslation: """Test batch translation behavior.""" @@ -457,7 +543,9 @@ def make_response(*args, **kwargs): assert mock_translator.run.call_count == 2 second_prompt = mock_translator.run.call_args_list[1][0][0] assert "上文參考" in second_prompt - # Should contain entries from the end of first batch (last 3) + # Should contain entries from the end of first batch (last 5) + assert "Line 6" in second_prompt + assert "Line 7" in second_prompt assert "Line 8" in second_prompt assert "Line 9" in second_prompt assert "Line 10" in second_prompt @@ -709,7 +797,10 @@ def test_retranslate_entries_with_context(self): mock_translator = Mock() mock_agent.return_value = mock_translator mock_response = Mock() - mock_response.content = "修正版第二句" + mock_response.content = ( + '{"index": 2, "original": "Corrected Line 2", ' + '"translated": "修正版第二句"}' + ) mock_translator.run.return_value = mock_response result = retranslate_entries( @@ -718,13 +809,162 @@ def test_retranslate_entries_with_context(self): user_context="主題是太空探索", ) - assert result == {2: "修正版第二句"} + assert result == { + 2: RetranslateResult( + index=2, + original="Corrected Line 2", + translated="修正版第二句", + ) + } prompt = mock_translator.run.call_args[0][0] assert "上文參考" in prompt + assert "Line 1 → 第一句" in prompt assert "下文參考" in prompt + assert "Line 3 → 第三句" in prompt assert "主題是太空探索" in prompt + assert "index: 2" in prompt + assert "原文: Line 2" in prompt + assert "目前翻譯(可修正): 第二句" in prompt + + def test_retranslate_entries_requires_structured_json_result(self): + entries = [RetranslateEntry(index=1, original="Line 1", translated="第一句")] + + with patch("bilingualsub.core.translator.Agent") as mock_agent: + mock_translator = Mock() + mock_agent.return_value = mock_translator + mock_response = Mock() + mock_response.content = "修正版第一句" + mock_translator.run.return_value = mock_response + + with pytest.raises(TranslationError): + retranslate_entries(entries=entries, selected_indices=[1]) def test_retranslate_entries_invalid_index_raises_error(self): entries = [RetranslateEntry(index=1, original="Line 1", translated="第一句")] with pytest.raises(ValueError, match="selected_indices not found"): retranslate_entries(entries=entries, selected_indices=[2]) + + +@pytest.mark.unit +class TestBuildModel: + """Test cases for translator model selection behavior.""" + + @pytest.fixture(autouse=True) + def _clear_settings_cache(self): + get_settings.cache_clear() + yield + get_settings.cache_clear() + + def _translate_one_entry_with_agent_mock(self): + entries = [ + SubtitleEntry( + index=1, + start=timedelta(seconds=0), + end=timedelta(seconds=2), + text="Hello", + ) + ] + subtitle = Subtitle(entries=entries) + + with patch("bilingualsub.core.translator.Agent") as mock_agent: + mock_translator = Mock() + mock_agent.return_value = mock_translator + mock_response = Mock() + mock_response.content = "1. 你好" + mock_translator.run.return_value = mock_response + + translate_subtitle(subtitle) + + return mock_agent.call_args.kwargs["model"] + + def test_given_groq_model_when_translate_subtitle_then_uses_raw_model_string( + self, monkeypatch + ): + monkeypatch.setenv("TRANSLATOR_MODEL", "groq:openai/gpt-oss-120b") + monkeypatch.delenv("OPENAI_BASE_URL", raising=False) + get_settings.cache_clear() + + model_arg = self._translate_one_entry_with_agent_mock() + + assert model_arg == "groq:openai/gpt-oss-120b" + + def test_given_openai_model_without_base_url_when_translate_then_uses_raw_string( + self, monkeypatch + ): + monkeypatch.setenv("TRANSLATOR_MODEL", "openai:gpt-4o") + monkeypatch.setenv("OPENAI_API_KEY", "sk-test") + monkeypatch.delenv("OPENAI_BASE_URL", raising=False) + get_settings.cache_clear() + + model_arg = self._translate_one_entry_with_agent_mock() + + assert model_arg == "openai:gpt-4o" + + def test_given_base_url_with_trailing_slash_when_translate_then_slash_stripped( + self, monkeypatch + ): + monkeypatch.setenv("TRANSLATOR_MODEL", "openai:gpt-4o") + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:3000/v1/") + get_settings.cache_clear() + + model_arg = _build_model(get_settings()) + + assert isinstance(model_arg, OpenAIChat) + assert model_arg.base_url == "http://localhost:3000/v1" + + def test_given_proxy_without_api_key_when_translate_subtitle_then_no_value_error( + self, monkeypatch + ): + # Guards: _ensure_translator_api_key must skip key check when proxy is configured, + # otherwise translate_subtitle raises ValueError before _build_model runs. + monkeypatch.setenv("TRANSLATOR_MODEL", "openai:any-model") + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:3000/v1") + get_settings.cache_clear() + + entries = [ + SubtitleEntry( + index=1, + start=timedelta(seconds=0), + end=timedelta(seconds=2), + text="Hello", + ) + ] + subtitle = Subtitle(entries=entries) + + with patch("bilingualsub.core.translator.Agent") as mock_agent: + mock_translator = Mock() + mock_agent.return_value = mock_translator + mock_response = Mock() + mock_response.content = "1. 你好" + mock_translator.run.return_value = mock_response + + # Primary assertion: no ValueError raised — proxy skips API key check + translate_subtitle(subtitle) + + def test_given_openai_model_with_proxy_when_build_model_then_uses_openai_chat( + self, monkeypatch + ): + monkeypatch.setenv("TRANSLATOR_MODEL", "openai:claude-sonnet-4-5") + monkeypatch.setenv("OPENAI_API_KEY", "cli-token") + monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:3000/v1") + get_settings.cache_clear() + + model_arg = _build_model(get_settings()) + + assert isinstance(model_arg, OpenAIChat) + assert model_arg.id == "claude-sonnet-4-5" + + def test_given_proxy_without_api_key_when_build_model_then_uses_placeholder_key( + self, monkeypatch + ): + monkeypatch.setenv("TRANSLATOR_MODEL", "openai:any-model") + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:3000/v1") + get_settings.cache_clear() + + model_arg = _build_model(get_settings()) + + assert isinstance(model_arg, OpenAIChat) + assert model_arg.api_key == _PROXY_PLACEHOLDER_API_KEY diff --git a/tests/unit/utils/test_ffmpeg_intro.py b/tests/unit/utils/test_ffmpeg_intro.py index 4ccb131..0610693 100644 --- a/tests/unit/utils/test_ffmpeg_intro.py +++ b/tests/unit/utils/test_ffmpeg_intro.py @@ -238,6 +238,34 @@ def test_when_channel_url_empty_then_vf_does_not_contain_channel_url_value( # Exactly 12 drawtext blocks when channel_url is omitted (13 when present) assert vf_value.count("drawtext=") == 12 + def test_generate_intro_uses_cjk_font_fallback_for_chinese_text( + self, tmp_path: Path, mock_intro_ffmpeg: dict, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Chinese intro text should target an installed CJK font, not generic serif.""" + output_path = tmp_path / "intro.mp4" + monkeypatch.setattr( + "bilingualsub.utils.ffmpeg._FONT_ZH_REGULAR", + tmp_path / "missing-noto-sans-tc.ttf", + ) + + generate_intro( + output_path, + width=1280, + height=720, + fps=30.0, + channel="ClaudeDevs", + video_title="ClaudeDevs - Artifacts in Claude Code", + video_url="https://x.com/ClaudeDevs/status/2072770790114914317?s=20", + channel_url="https://x.com/ClaudeDevs", + ) + + cmd = _get_popen_cmd(mock_intro_ffmpeg["popen"]) + vf_idx = cmd.index("-vf") + vf_value = cmd[vf_idx + 1] + + assert "font='Noto Sans CJK TC'" in vf_value + assert "font='serif'" not in vf_value + def test_when_ffmpeg_fails_then_raises_ffmpeg_error( self, tmp_path: Path, mock_intro_ffmpeg: dict ) -> None: @@ -285,6 +313,30 @@ def test_generate_intro_always_uses_libx264_regardless_of_platform( # VideoToolbox must NOT be used for intro assert "h264_videotoolbox" not in cmd + def test_generate_intro_includes_silent_audio_track_for_concat( + self, tmp_path: Path, mock_intro_ffmpeg: dict + ) -> None: + """Intro must include silent AAC audio so concat keeps the main video's audio.""" + output_path = tmp_path / "intro.mp4" + + generate_intro( + output_path, + width=1920, + height=1080, + fps=30.0, + channel="Ch", + video_title="T", + video_url="https://example.com", + ) + + cmd = _get_popen_cmd(mock_intro_ffmpeg["popen"]) + assert "anullsrc=channel_layout=stereo:sample_rate=48000" in cmd + assert "-an" not in cmd + assert "-c:a" in cmd + ca_idx = cmd.index("-c:a") + assert cmd[ca_idx + 1] == "aac" + assert "-shortest" in cmd + # --------------------------------------------------------------------------- # concat_videos