From 8d9eb71d1859ee8fe273574066925718411faf03 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 14 Jun 2026 06:14:59 +0000 Subject: [PATCH] =?UTF-8?q?feat(F9-2i):=20translate=20route=20=E6=8E=A5?= =?UTF-8?q?=E8=AA=B2=E7=A8=8B=20glossary=20=E5=9B=BA=E5=AE=9A=E8=AD=AF?= =?UTF-8?q?=E5=90=8D=EF=BC=88project=5Fid=20=E2=86=92=20to=5Ftranslation?= =?UTF-8?q?=5Frules=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POST /localization/translate 加選填 project_id:給了即現讀該課 glossary 的固定譯名 (to_translation_rules)併進 glossary 規則送翻譯,讓在地化術語前後一致。 - TranslateRequest 加 project_id(optional,向後相容、既有 caller 零影響) - _glossary_lang_candidates:canonical 區域碼↔glossary 短碼對齊(en-US→en, zh-CN/zh-TW 完整碼本就是 key) - _course_glossary_rules / _merge_glossary:顯式 glossary 在前、課程在後合併 - fail-soft(RFC §5):沒 pid / 課不存在 / 無 glossary / 無該語言譯名 / 讀檔出錯 → 沿用現行行為,絕不讓翻譯失敗;glossary 讀檔走 to_thread(R-3) - 完全不碰 review gate / 狀態機(只影響術語怎麼譯) - 新增 tests/test_localization_glossary.py 9 測(全 mock translate、tmp 隔離) 本機全套 2673 passed(3 個 QR/journal 字型像素為容器缺 Noto CJK 假象,CI 權威)。 --- docs/PRODUCT_READINESS.md | 19 +++- server/routes/localization.py | 80 +++++++++++++- tests/test_localization_glossary.py | 158 ++++++++++++++++++++++++++++ 3 files changed, 250 insertions(+), 7 deletions(-) create mode 100644 tests/test_localization_glossary.py diff --git a/docs/PRODUCT_READINESS.md b/docs/PRODUCT_READINESS.md index 30e62b0..ea8d2ad 100644 --- a/docs/PRODUCT_READINESS.md +++ b/docs/PRODUCT_READINESS.md @@ -686,9 +686,22 @@ 收斂 None;`_run_render` wiring inner 期間掛上·出去還原·無 glossary 沿用全域,**全 offline 不打 API、不真跑 TTS**)。本機相關子集 105 passed、全套 2665 passed(3 個 QR/journal 字型像素為容器缺 Noto CJK 假象,CI 權威)。 - - ⏸️ **後續 offline slice**:F9-2i(翻譯 route 接 `to_translation_rules()`,同 `project_id` 關聯 - 讓在地化翻譯套固定譯名),見上 RFC §4。**自動建議術語**(掃教材抽術語)碰 Gemini 額度 = GATE, - 另寫 proposal 再做。 + - ✅ 2026-06-14 **F9-2i:翻譯 route 接 `to_translation_rules()` 完成(offline,F9-2 offline 收尾)**。 + `POST /localization/translate` 的 `TranslateRequest` 加**選填** `project_id`:給了 → 以 + `ProjectStore.get_glossary(project_id)` 現讀該課 glossary → `to_translation_rules(target_lang)` + 產固定譯名規則文字塊,與呼叫端顯式 `glossary` 合併(顯式在前、課程在後)後送 `translator. + translate(glossary=...)`,讓在地化翻譯術語前後一致。canonical 區域碼↔glossary 短碼對得上 + (`_glossary_lang_candidates`:完整碼優先、再退基底子標籤,`en-US`→`en`、`zh-CN`/`zh-TW` + 完整碼本就是 glossary key)。守 RFC §5 **fail-soft**:沒 `project_id` / 課不存在 + (`ProjectNotFoundError`)/ 無 glossary / 該語言無譯名 / 讀檔出錯 → 一律回空課程規則、沿用現行 + 行為,**絕不讓翻譯失敗**;glossary 讀檔沿 R-3 走 `to_thread` 不阻 event loop。**完全不碰 review + gate / 狀態機**(只影響「術語怎麼譯」,硬規則 #1)。`project_id` 為 optional=既有 caller 零影響、 + 向後相容。補 `tests/test_localization_glossary.py` 9 測(注入固定譯名含別名並排/顯式+課程合併順序/ + 完整區域碼命中/該語言無譯名不附/沒 pid·未知 pid·無 glossary·空白 pid fail-soft,**全 mock + translate 不打真 API、ProjectStore tmp 隔離**=offline-first)。本機相關子集 68 passed、全套 2673 + passed(3 個 QR/journal 字型像素為容器缺 Noto CJK 假象,CI 權威)。**前端 `LocalizeMenu` 傳 + `project_id`** 屬後續前端 slice(route 欄位選填、不破壞現況)。**自動建議術語**(掃教材抽術語) + 碰 Gemini 額度 = GATE,另寫 proposal 再做。F9-2 offline slice 至此到齊。 - [~] 🟡 **F9-3 本機可插拔模型後端**(GATE,= M 軸 Option B 的本機 provider)— 支援 **Ollama 等本機 LLM** 跑文字(大綱/旁白/翻譯),老師可零雲端成本跑(翻譯已用本機 translategemma 驗過路子)。**依賴 M-4 provider 介面就緒**後加 ollama adapter + 設定頁可選 diff --git a/server/routes/localization.py b/server/routes/localization.py index 936194d..f29d2e0 100644 --- a/server/routes/localization.py +++ b/server/routes/localization.py @@ -14,14 +14,18 @@ import os import tempfile -from fastapi import APIRouter, File, Form, UploadFile +from fastapi import APIRouter, Depends, File, Form, UploadFile from pydantic import BaseModel, Field +from core.glossary import to_translation_rules from core.langcode import LANGUAGES, to_underscore from core.meeting.summarizer import meeting_summarizer +from core.project import ProjectNotFoundError, ProjectStore from core.translation.service import translator from core.video.dubber import get_video_dubber +from .projects import get_default_project_store + router = APIRouter(prefix="/localization", tags=["localization"]) @@ -45,6 +49,9 @@ class TranslateRequest(BaseModel): target_lang: str = "zh-TW" # canonical 連字號 source_lang: str = "auto" glossary: str = "" + # F9-2i:選填課程關聯。給了 → 載入該課 glossary 的固定譯名(to_translation_rules) + # 併進 glossary 規則,讓在地化翻譯術語前後一致。沒給/查無 → 沿用現行行為(零影響)。 + project_id: str | None = None style: str = "" @@ -89,6 +96,60 @@ def _first(gen) -> str: return out +# ---------- F9-2i:課程 glossary → 翻譯固定譯名 ---------- +def _glossary_lang_candidates(code: str) -> list[str]: + """canonical 目標語言碼 → glossary translations key 候選(完整碼優先、再退基底子標籤)。 + + glossary 的逐語言譯名 key 用前端 LANGS 短碼(en/ja/ko/zh-CN/vi),但翻譯 route 對外收 + canonical BCP-47 區域碼(en-US/ja-JP/...)。先試完整碼(zh-CN/zh-TW 本就是 glossary key), + 再退基底(en-US → en)涵蓋「術語表用短碼登錄、route 收區域碼」的常見情形。 + """ + out = [code] + base = code.split("-", 1)[0] if code else "" + if base and base != code: + out.append(base) + return out + + +def _course_glossary_rules( + project_id: str | None, target_lang: str, store: ProjectStore +) -> str: + """載入該課 glossary → 該目標語言的 `to_translation_rules` 文字塊。 + + fail-soft(RFC §5):沒 project_id / 課不存在 / 無 glossary / 該語言無譯名 / 讀檔出錯 → + 一律回空字串,**絕不因為「想套術語」而讓翻譯失敗**。 + """ + if not project_id or not project_id.strip(): + return "" + try: + glossary = store.get_glossary(project_id) + except ProjectNotFoundError: + return "" + except Exception: # noqa: BLE001 — glossary 壞檔等任何問題都不該擋翻譯(fail-soft) + return "" + if glossary is None: + return "" + for lang in _glossary_lang_candidates(target_lang): + rules = to_translation_rules(glossary, lang) + if rules: + return rules + return "" + + +def _merge_glossary(caller_glossary: str, course_rules: str) -> str: + """合併呼叫端顯式 glossary 與該課 glossary 規則(都丟給 translate 的 glossary 參數)。 + + 呼叫端顯式規則放前面(較專一/手動覆寫優先呈現),課程規則接在後;任一為空就只留另一條, + 兩者皆空回空字串(translate 對空字串 no-op,行為與不傳 glossary 一致)。 + """ + parts = [] + if caller_glossary and caller_glossary.strip(): + parts.append(caller_glossary.strip()) + if course_rules: + parts.append(course_rules) + return "\n".join(parts) + + # ---------- 端點 ---------- @router.get("/languages") async def list_languages() -> dict: @@ -102,13 +163,24 @@ async def list_languages() -> dict: @router.post("/translate") -async def translate_text(req: TranslateRequest) -> dict: - """文字翻譯。對外 zh-TW,邊界轉 zh_TW 後送 Gemini 服務。""" +async def translate_text( + req: TranslateRequest, + store: ProjectStore = Depends(get_default_project_store), +) -> dict: + """文字翻譯。對外 zh-TW,邊界轉 zh_TW 後送 Gemini 服務。 + + F9-2i:給了 `project_id` → 載入該課 glossary 的固定譯名併進 glossary 規則(fail-soft)。 + """ + # 課程 glossary 讀檔(小型本機 JSON + RLock)也走 to_thread,沿 R-3 不阻 event loop。 + course_rules = await asyncio.to_thread( + _course_glossary_rules, req.project_id, req.target_lang, store + ) + glossary = _merge_glossary(req.glossary, course_rules) # R-3: 翻譯是 blocking (Gemini HTTP) → to_thread 不阻 event loop translated = await asyncio.to_thread( translator.translate, req.text, _u(req.source_lang), _u(req.target_lang), - glossary=req.glossary, style=req.style, + glossary=glossary, style=req.style, ) return { "translated_text": translated, diff --git a/tests/test_localization_glossary.py b/tests/test_localization_glossary.py new file mode 100644 index 0000000..e1920ca --- /dev/null +++ b/tests/test_localization_glossary.py @@ -0,0 +1,158 @@ +"""F9-2i:在地化翻譯 route 接課程 glossary 固定譯名測試。 + +驗收(對應 docs/JOB_COURSE_ASSOCIATION_RFC.md §4.3): +- `POST /localization/translate` 帶 `project_id` → 載入該課 glossary 的固定譯名 + (`to_translation_rules`)併進 glossary 規則送翻譯。 +- 呼叫端顯式 `glossary` 與課程規則合併(顯式在前、課程在後)。 +- canonical 區域碼(en-US)↔ glossary 短碼(en)對得上(候選碼退基底)。 +- fail-soft(RFC §5):沒 project_id / 課不存在 / 無 glossary / 該語言無譯名 → + 沿用現行行為(不傳課程規則、絕不讓翻譯失敗)。 + +Mock 策略:monkeypatch translator.translate 攔截實際送進去的 glossary 字串、不打真 +Gemini;ProjectStore 注入 tmp_path 隔離(全 offline-first)。 +""" +from __future__ import annotations + +import pytest + +pytest.importorskip("fastapi.testclient", reason="需要 fastapi 安裝") +pytest.importorskip("multipart", reason="server.main 內 upload route 需要") + +from fastapi.testclient import TestClient + +import core.translation.service as svc +import server.routes.projects as projects_mod +from core.glossary import Glossary, GlossaryEntry +from server.main import create_app + + +@pytest.fixture +def client(tmp_path, monkeypatch): + """TestClient + 隔離 ProjectStore + 攔截 translate 的 glossary 字串。""" + seen = {} + + def fake_translate(text, source_code, target_code, glossary="", style=""): + seen["glossary"] = glossary + seen["target"] = target_code + return "譯文" + + monkeypatch.setattr(svc.translator, "translate", fake_translate) + + app = create_app() + project_store = projects_mod.ProjectStore(root=tmp_path / "projects") + app.dependency_overrides[projects_mod.get_default_project_store] = lambda: project_store + with TestClient(app) as c: + yield c, project_store, seen + + +def _make_course_with_glossary(store, pid="course_statics", *, course="靜力學"): + """建一門課 + 一份含固定譯名的 glossary(en/ja)。""" + store.create(pid, title=course) + glossary = Glossary( + course=course, + entries=[ + GlossaryEntry( + term="自然頻率", + aliases=["ω_n", "wn"], + translations={"en": "natural frequency", "ja": "固有振動数"}, + ), + GlossaryEntry(term="阻尼比", translations={"en": "damping ratio"}), + ], + ) + store.save_glossary(pid, glossary) + return pid + + +class TestCourseGlossaryWiring: + def test_project_id_injects_translation_rules(self, client): + """帶 project_id → glossary 固定譯名(含別名並排)併進送翻譯的 glossary。""" + c, store, seen = client + pid = _make_course_with_glossary(store) + r = c.post("/localization/translate", json={ + "text": "自然頻率與阻尼比", "target_lang": "en-US", "project_id": pid, + }) + assert r.status_code == 200 + g = seen["glossary"] + # en-US 退基底 en → 對上 glossary 的 "en" 譯名 + assert "natural frequency" in g + assert "damping ratio" in g + # 來源面列出 term + 別名(longest-first,/ 並排) + assert "ω_n" in g and "自然頻率" in g + + def test_caller_glossary_merged_first(self, client): + """呼叫端顯式 glossary 與課程規則合併,顯式在前。""" + c, store, seen = client + pid = _make_course_with_glossary(store) + r = c.post("/localization/translate", json={ + "text": "x", "target_lang": "en-US", "project_id": pid, + "glossary": "手動規則 → manual", + }) + assert r.status_code == 200 + g = seen["glossary"] + assert "手動規則 → manual" in g + assert "natural frequency" in g + # 顯式規則排在課程規則之前 + assert g.index("手動規則") < g.index("natural frequency") + + def test_exact_region_code_matches(self, client): + """glossary key 用完整區域碼(zh-CN)時直接命中、不誤退基底。""" + c, store, seen = client + store.create("c2", title="材力") + store.save_glossary("c2", Glossary( + course="材力", + entries=[GlossaryEntry(term="應力", translations={"zh-CN": "应力"})], + )) + r = c.post("/localization/translate", json={ + "text": "應力", "target_lang": "zh-CN", "project_id": "c2", + }) + assert r.status_code == 200 + assert "应力" in seen["glossary"] + + def test_no_translation_for_lang_no_rules(self, client): + """該課 glossary 沒有目標語言譯名 → 不附課程規則(沿用空 glossary)。""" + c, store, seen = client + pid = _make_course_with_glossary(store) # 只有 en/ja + r = c.post("/localization/translate", json={ + "text": "x", "target_lang": "ko-KR", "project_id": pid, + }) + assert r.status_code == 200 + assert seen["glossary"] == "" + + +class TestFailSoft: + def test_no_project_id_passes_caller_glossary_only(self, client): + """沒 project_id → 只送呼叫端 glossary(現行行為,零影響)。""" + c, _store, seen = client + r = c.post("/localization/translate", json={ + "text": "x", "target_lang": "en-US", "glossary": "只有這條 → only", + }) + assert r.status_code == 200 + assert seen["glossary"] == "只有這條 → only" + + def test_unknown_project_id_fail_soft(self, client): + """project_id 指向不存在的課 → fail-soft 不報錯、不附課程規則。""" + c, _store, seen = client + r = c.post("/localization/translate", json={ + "text": "x", "target_lang": "en-US", "project_id": "nope", + }) + assert r.status_code == 200 + assert seen["glossary"] == "" + + def test_course_without_glossary_fail_soft(self, client): + """課存在但還沒建 glossary → fail-soft 回空課程規則。""" + c, store, seen = client + store.create("bare", title="尚無術語表") + r = c.post("/localization/translate", json={ + "text": "x", "target_lang": "en-US", "project_id": "bare", + }) + assert r.status_code == 200 + assert seen["glossary"] == "" + + def test_blank_project_id_treated_as_none(self, client): + """空白 project_id 視同未提供。""" + c, _store, seen = client + r = c.post("/localization/translate", json={ + "text": "x", "target_lang": "en-US", "project_id": " ", + }) + assert r.status_code == 200 + assert seen["glossary"] == ""