diff --git a/docs/PRODUCT_READINESS.md b/docs/PRODUCT_READINESS.md index 34bef6b..ffc8988 100644 --- a/docs/PRODUCT_READINESS.md +++ b/docs/PRODUCT_READINESS.md @@ -124,6 +124,17 @@ + `_validate_upload` 單元 + NFC)。全套 2410 passed。 - 註:`localization.py` 的 dub 上傳走 `tempfile`(OS 管理路徑,audit 評為低風險),本輪未動; 若要一併套白名單可開後續小 PR。 + - ✅ 2026-06-15 **後續小 PR 完成(localization 檔案端點上傳硬化)**。`localization.py` 的 5 個 + multipart 端點(`translate/image`、`translate/pdf`、`meeting/summarize`、`song/transcribe`、 + `dub`)原先只把上傳寫進 `tempfile`(無 path-traversal 風險,故 S-3 評低風險),但缺 `/upload` + 已有的「副檔名 + MIME 白名單」——任何人都能往這些端點塞非預期檔案。新增共用 + `_validate_media_upload()`:**副檔名為強 gate**(per 端點媒體類別:image=圖片副檔名、pdf=`.pdf`、 + meeting/song/dub=影音副檔名),**MIME 寬鬆輔助**(octet-stream/空字串放行=瀏覽器常見,只擋「有給 + 且明顯非該類」的大類,比照 S-4)。`dub` 走 url 來源不受影響(沒檔不驗)。檔案進 `mkstemp` 不用原 + 檔名,故只驗媒體類別、不另做檔名 sanitize(path 安全由 tempfile 保證)。補 + `tests/test_localization_upload.py` 16 測(強 gate 擋 .exe/文件塞影音端點/無副檔名 + MIME 寬鬆 + octet-stream/空放行·矛盾 MIME 擋下 + dub url 不驗·上傳驗 + 純函式單元,**全 mock 不打 API/不跑 + ffmpeg**)。本機全套 2691 passed(剩 1 QR 像素為容器缺 Noto CJK 字型假象,CI 權威)。 - [x] 🟡 **S-5 secret 落地強化**(GATE→已拍板)— ✅ 2026-06-07 完成。**劉老師拍板:不加密** (明文 + gitignore,自架單機可接受,Fernet 靜態加密過度設計、徒增金鑰管理負擔)。改為在 `SECURITY.md` 把機密檔處置講清楚:明文存放、**別放共享磁碟/雲端同步、別進未加密備份**(要備份 diff --git a/server/routes/localization.py b/server/routes/localization.py index f29d2e0..5c02e81 100644 --- a/server/routes/localization.py +++ b/server/routes/localization.py @@ -14,7 +14,7 @@ import os import tempfile -from fastapi import APIRouter, Depends, File, Form, UploadFile +from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile, status from pydantic import BaseModel, Field from core.glossary import to_translation_rules @@ -43,6 +43,49 @@ def _save_upload(upload: UploadFile, suffix: str = "") -> str: return path +# ---------- S-4(後續): 檔案端點上傳硬化 ---------- +# localization 的 multipart 端點原先只把上傳寫進 tempfile(OS 管理路徑、無 path-traversal +# 風險,S-3 評為低風險),但缺 uploads.py `/upload` 已有的「副檔名 + MIME 白名單」——任何人 
+# ...so anyone could push unexpected files at the image/pdf/meeting/song/dub endpoints. As in
+# S-4: the extension is the **hard gate** (per-endpoint media class); MIME is a **lenient aid**
+# (browsers often send octet-stream / empty, so only a declared, clearly-other class is refused).
+# Files go to mkstemp without the original name, so no filename sanitising (tempfile keeps paths safe).
+_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif", ".tif", ".tiff"}
+_PDF_EXTS = {".pdf"}
+# Video / audio (shared by meeting / song / dub)
+_AV_EXTS = {
+    ".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v",
+    ".mp3", ".wav", ".m4a", ".aac", ".flac", ".ogg", ".opus",
+}
+_PDF_MIME = frozenset({"application/pdf", "application/x-pdf", "application/acrobat"})
+
+
+def _validate_media_upload(
+    upload: UploadFile,
+    allowed_exts: set[str],
+    label: str,
+    *,
+    mime_prefixes: tuple[str, ...] = (),
+    mime_exact: frozenset[str] = frozenset(),
+) -> None:
+    """Raise HTTP 400 unless the upload's extension and declared MIME type are acceptable.
+
+    Extension (lower-cased, from ``upload.filename``) must be in ``allowed_exts``. MIME is lenient:
+    empty or ``application/octet-stream`` passes; otherwise it must be in ``mime_exact`` or start
+    with one of ``mime_prefixes``. ``label`` names the media class in the error detail.
+    """
+    suffix = os.path.splitext(upload.filename or "")[1].lower()
+    if suffix not in allowed_exts:
+        detail = f"不接受的副檔名 {suffix or '(無)'}; {label} 只收 {sorted(allowed_exts)}"
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, detail)
+    # Compare only the media type, without parameters such as "; charset=...".
+    declared = (upload.content_type or "").split(";")[0].strip().lower()
+    if not declared or declared == "application/octet-stream" or declared in mime_exact:
+        return
+    if not declared.startswith(tuple(mime_prefixes)):
+        raise HTTPException(status.HTTP_400_BAD_REQUEST, f"不接受的 MIME 類型: {declared}({label})")
+
+
 # ---------- 請求模型 ----------
 class TranslateRequest(BaseModel):
     text: str
@@ -234,6 +277,7 @@ async def translate_image(
     source_lang: str = Form("auto"),
 ) -> dict:
     """圖片 OCR + 翻譯(pytesseract,lazy)。回最終翻譯文字。"""
+    _validate_media_upload(file, _IMAGE_EXTS, "圖片", mime_prefixes=("image/",))
     path = _save_upload(file)
     try:
         result = await asyncio.to_thread(
@@ -253,6 +297,7 @@ async def translate_pdf(
     source_lang: str = Form("en-US"),
 ) -> dict:
     """PDF 逐頁翻譯(PyMuPDF,lazy)。回最終彙整文字。"""
+    _validate_media_upload(file, _PDF_EXTS, "PDF", mime_exact=_PDF_MIME)
     path = _save_upload(file, suffix=".pdf")
     try:
         result = await asyncio.to_thread(
@@ -275,6 +320,7 @@ async def 
meeting_summarize(
 
     summary_types 以逗號分隔(如 'key_points,decisions')。
     """
+    _validate_media_upload(file, _AV_EXTS, "會議影音", mime_prefixes=("video/", "audio/"))
     path = _save_upload(file)
     types = [t.strip() for t in summary_types.split(",") if t.strip()]
     try:
@@ -307,6 +353,7 @@ async def song_transcribe(
     """
     from core.song_build import build_song_json_from_media
 
+    _validate_media_upload(file, _AV_EXTS, "歌曲影音", mime_prefixes=("video/", "audio/"))
     suffix = os.path.splitext(file.filename or "")[1] or ".mp3"
     path = _save_upload(file, suffix=suffix)
     try:
@@ -338,6 +385,7 @@ async def dub_video(
     if not url:
         if file is None:
             return {"error": "需提供 url 或上傳 file"}
+        _validate_media_upload(file, _AV_EXTS, "配音影片", mime_prefixes=("video/", "audio/"))
         path = _save_upload(file, suffix=".mp4")
     source = url or path
     try:
diff --git a/tests/test_localization_upload.py b/tests/test_localization_upload.py
new file mode 100644
index 0000000..951efca
--- /dev/null
+++ b/tests/test_localization_upload.py
@@ -0,0 +1,203 @@
+"""Upload-hardening tests for server.routes.localization (S-4 follow-up: file endpoints).
+
+Acceptance (mirrors the S-4 section of test_upload.py):
+- All 5 multipart endpoints (image/pdf/meeting/song/dub) apply an **extension whitelist** (hard gate) +
+  a **lenient MIME whitelist** (secondary check).
+- Legitimate media still passes; files clearly of another class (e.g. .exe, a document sent to an A/V endpoint) get 400.
+- Lenient MIME: octet-stream / empty string pass; a clearly mismatched MIME (image sent to an A/V endpoint) is rejected.
+- dub with a url source is unaffected by upload hardening (no file, nothing to validate).
+Media/translation modules are monkeypatched throughout: no real API calls, no ffmpeg/whisper.
+"""
+from __future__ import annotations
+
+import pytest
+
+pytest.importorskip("fastapi.testclient", reason="需要 fastapi 安裝")
+pytest.importorskip("multipart", reason="server.main 內 upload route 需要")
+
+from fastapi.testclient import TestClient
+
+import core.translation.service as svc
+import server.routes.localization as loc
+from server.main import create_app
+
+
+@pytest.fixture
+def client(monkeypatch):
+    # Stub the translator calls that would hit real media/Gemini, so valid uploads reach fakes.
+    monkeypatch.setattr(
+        svc.translator, "translate_image",
+        lambda path, t, s: iter(["IMG_OK"]))
+    monkeypatch.setattr(
+        svc.translator, "translate_pdf",
+        lambda path, t, s: iter(["PDF_OK"]))
+    app = create_app()
+    with TestClient(app) as c:
+        yield c
+
+
+# ---------- Extension hard gate (rejects files of another class) ----------
+class TestExtensionGate:
+    def test_image_rejects_non_image_ext(self, client):
+        r = client.post(
+            "/localization/translate/image",
+            files={"file": ("evil.exe", b"MZ", "application/octet-stream")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 400
+        assert "副檔名" in r.json()["detail"]
+
+    def test_image_accepts_png(self, client):
+        r = client.post(
+            "/localization/translate/image",
+            files={"file": ("photo.png", b"\x89PNG", "image/png")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 200 and r.json()["result"] == "IMG_OK"
+
+    def test_pdf_rejects_non_pdf_ext(self, client):
+        r = client.post(
+            "/localization/translate/pdf",
+            files={"file": ("notes.txt", b"hi", "text/plain")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 400 and "副檔名" in r.json()["detail"]
+
+    def test_pdf_accepts_pdf(self, client):
+        r = client.post(
+            "/localization/translate/pdf",
+            files={"file": ("doc.pdf", b"%PDF-1.4", "application/pdf")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 200 and r.json()["result"] == "PDF_OK"
+
+    def test_meeting_rejects_document(self, client):
+        # A document sent to an A/V endpoint is rejected
+        r = client.post(
+            "/localization/meeting/summarize",
+            files={"file": ("m.pdf", b"%PDF", "application/pdf")},
+            data={"language": "zh-TW"},
+        )
+        assert r.status_code == 400 and "副檔名" in r.json()["detail"]
+
+    def test_song_rejects_image(self, client):
+        r = client.post(
+            "/localization/song/transcribe",
+            files={"file": ("cover.png", b"\x89PNG", "image/png")},
+            data={"song_title": "x"},
+        )
+        assert r.status_code == 400 and "副檔名" in r.json()["detail"]
+
+    def test_missing_filename_rejected(self, client):
+        # A filename without any extension ("noext") never passes the hard gate
+        r = client.post(
+            "/localization/translate/image",
+            files={"file": ("noext", b"data", "image/png")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 400 and "(無)" in r.json()["detail"]
+
+
+# ---------- Lenient MIME check ----------
+class TestMimeLenient:
+    def test_octet_stream_passes(self, client):
+        # Valid extension + the browser-typical octet-stream MIME passes
+        r = client.post(
+            "/localization/translate/image",
+            files={"file": ("photo.jpg", b"\xff\xd8", "application/octet-stream")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 200
+
+    def test_empty_mime_passes(self, client):
+        r = client.post(
+            "/localization/translate/image",
+            files={"file": ("photo.jpg", b"\xff\xd8", "")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 200
+
+    def test_wrong_mime_for_ext_rejected(self, client):
+        # Extension is .jpg but the MIME says video: contradictory, so rejected
+        r = client.post(
+            "/localization/translate/image",
+            files={"file": ("photo.jpg", b"\xff\xd8", "video/mp4")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 400 and "MIME" in r.json()["detail"]
+
+    def test_av_endpoint_accepts_audio_mime(self, client, monkeypatch):
+        import core.meeting.summarizer as msum
+        from core.meeting.summarizer import MeetingSummaryResult
+
+        monkeypatch.setattr(
+            msum.meeting_summarizer, "process_video",
+            lambda *a, **k: MeetingSummaryResult(
+                transcript="t", transcript_with_time="t",
+                summary={"full_summary": "s"}, duration=1.0, language="zh"))
+        r = client.post(
+            "/localization/meeting/summarize",
+            files={"file": ("rec.mp3", b"ID3", "audio/mpeg")},
+            data={"language": "zh-TW"},
+        )
+        assert r.status_code == 200 and r.json()["summary"] == {"full_summary": "s"}
+
+
+# ---------- dub: a url source is unaffected by upload hardening ----------
+class TestDubSource:
+    def test_dub_url_skips_upload_validation(self, client, monkeypatch):
+        class _FakeDubber:
+            def process_video(self, source, src, tgt, burn_subtitles=False):
+                return {"dubbed_video": "/tmp/out.mp4"}
+
+        monkeypatch.setattr(loc, "get_video_dubber", lambda: _FakeDubber())
+        r = client.post("/localization/dub", data={
+            "url": "https://youtu.be/x", "target_lang": "zh-TW",
+        })
+        assert r.status_code == 200
+        assert r.json()["results"]["dubbed_video"] == "/tmp/out.mp4"
+
+    def test_dub_rejects_non_av_upload(self, client):
+        r = client.post(
+            "/localization/dub",
+            files={"file": ("x.exe", b"MZ", "application/octet-stream")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 400 and "副檔名" in r.json()["detail"]
+
+    def test_dub_accepts_mp4_upload(self, client, monkeypatch):
+        class _FakeDubber:
+            def process_video(self, source, src, tgt, burn_subtitles=False):
+                return {"dubbed_video": "/tmp/out.mp4"}
+
+        monkeypatch.setattr(loc, "get_video_dubber", lambda: _FakeDubber())
+        r = client.post(
+            "/localization/dub",
+            files={"file": ("clip.mp4", b"\x00\x00", "video/mp4")},
+            data={"target_lang": "zh-TW"},
+        )
+        assert r.status_code == 200
+
+
+# ---------- Pure-function unit tests ----------
+class TestValidatorUnit:
+    def test_rejects_bad_ext(self):
+        from fastapi import HTTPException, UploadFile
+        import io
+
+        up = UploadFile(filename="x.exe", file=io.BytesIO(b""))
+        with pytest.raises(HTTPException) as ei:
+            loc._validate_media_upload(up, loc._AV_EXTS, "影音",
+                                       mime_prefixes=("video/", "audio/"))
+        assert ei.value.status_code == 400
+
+    def test_accepts_good_ext_and_prefix(self):
+        from fastapi import UploadFile
+        from fastapi.datastructures import Headers
+        import io
+
+        up = UploadFile(
+            filename="a.mp3", file=io.BytesIO(b""),
+            headers=Headers({"content-type": "audio/mpeg"}))
+        loc._validate_media_upload(up, loc._AV_EXTS, "影音",  # must not raise
+                                   mime_prefixes=("video/", "audio/"))