From c0d57a95547380b58531bb95534db8a90472481d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 20:20:50 +0300
Subject: [PATCH 01/15] feat: replace run summary cards with a terminal-style
 progress dashboard in the UI

---
 CHANGELOG.md                      |  6 +-
 ocr_pipeline/static/css/style.css | 94 ++++++++++++++++++++-----------
 ocr_pipeline/static/index.html    | 51 +++++++----------
 ocr_pipeline/static/js/app.js     | 36 ++++++++++++
 tests/test_ui_routes.py           | 15 +++++
 5 files changed, 137 insertions(+), 65 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 60d5daa..1f29e9f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,9 +2,10 @@
 
 All notable changes to OpenCR are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/), and the project follows [Semantic Versioning](https://semver.org/).
 
-## [Unreleased]
+## [v1.0.0]
 
 ### Added
+
 - Apache-2.0 license (`LICENSE`).
 - English-first README with Turkish sibling at `README.tr.md`.
 - `CONTRIBUTING.md`, GitHub Actions CI workflow, project `Makefile`.
@@ -15,11 +16,13 @@ All notable changes to OpenCR are documented here. The format follows [Keep a Ch
 - Publish modal now prefills `username/run-name` and adds the `opencr` discoverability tag to dataset cards.
 
 ### Changed
+
 - **Breaking:** `docker compose up` no longer starts services without an explicit profile. Use `--profile gpu` (vLLM, NVIDIA) or `--profile cpu` (in-process transformers).
 - `INPUT_DIR` / `OUTPUT_DIR` default to `./input` / `./output` outside Docker, `/data/...` inside.
 - OpenAPI metadata now declares Apache-2.0; UI footer no longer claims "All rights reserved".
 
 ### Fixed
+
 - `.gitignore` now covers `.DS_Store`, IDE folders, lint caches, and HF caches.
 
 ---
@@ -34,6 +37,7 @@ All notable changes to OpenCR are documented here. The format follows [Keep a Ch
 6. GitHub auto-creates a release page from the tag; paste the changelog entry into it.
 
 Bump rules:
+
 - **PATCH** for bug fixes that don't change behavior.
 - **MINOR** for backwards-compatible features.
 - **MAJOR** for breaking changes (env var renames, removed endpoints, behavior shifts users have to adapt to).
diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css
index 2912a34..310d65c 100644
--- a/ocr_pipeline/static/css/style.css
+++ b/ocr_pipeline/static/css/style.css
@@ -307,29 +307,74 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 
 .run-detail { display: flex; flex-direction: column; }
 
-.run-summary {
-  display: grid;
-  grid-template-columns: repeat(4, 1fr);
+.run-terminal {
+  margin: 16px 24px;
+  padding: 14px 16px;
+  border-radius: 8px;
+  background: #111827;
+  color: #d1d5db;
+  font-family: var(--font-mono);
+}
+
+.run-terminal-head,
+.run-terminal-status {
+  display: flex;
+  align-items: center;
+}
+
+.run-terminal-head {
+  justify-content: space-between;
   gap: 12px;
-  padding: 16px 24px;
 }
-.summary-card {
-  padding: 12px 14px;
-  border: 1px solid var(--border);
-  border-radius: var(--radius-sm);
-  background: rgba(255, 255, 255, 0.6);
+
+.run-terminal-status { gap: 8px; min-width: 0; }
+.run-terminal-label { color: #f9fafb; font-weight: 700; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
+.run-terminal code { color: #93c5fd; font: inherit; font-weight: 700; white-space: nowrap; }
+
+.run-spinner {
+  width: 10px;
+  height: 10px;
+  border: 2px solid rgba(209, 213, 219, 0.28);
+  border-top-color: #34d399;
+  border-radius: 50%;
+  animation: run-spin 0.8s linear infinite;
 }
-.summary-label { font-size: 0.72rem; color: var(--muted); text-transform: uppercase; letter-spacing: 0.1em; }
-.summary-card strong { display: block; margin-top: 4px; font-size: 1.05rem; font-weight: 700; }
 
-.progress-track {
+@keyframes run-spin { to { transform: rotate(360deg); } }
+
+.run-terminal-bar {
   height: 6px;
-  margin-top: 10px;
+  margin: 12px 0;
   border-radius: 999px;
-  background: rgba(31, 109, 85, 0.12);
+  background: #1f2937;
   overflow: hidden;
 }
-.progress-fill { height: 100%; background: linear-gradient(90deg, #1f6d55, #2d8a6b); transition: width 0.25s ease; }
+
+.run-terminal-fill {
+  height: 100%;
+  border-radius: inherit;
+  background: #34d399;
+  transition: width 0.25s ease;
+}
+
+.run-terminal-body {
+  display: grid;
+  gap: 5px;
+  font-size: 0.82rem;
+}
+
+.run-terminal-body p {
+  display: flex;
+  gap: 10px;
+  margin: 0;
+  min-width: 0;
+}
+
+.terminal-key {
+  width: 72px;
+  flex: 0 0 72px;
+  color: #9ca3af;
+}
 
 .run-actions { display: flex; gap: 10px; padding: 0 24px 16px; }
 
@@ -352,24 +397,6 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 .doc-name { font-weight: 600; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
 .doc-meta { display: flex; gap: 6px; align-items: center; margin-top: 4px; font-size: 0.82rem; color: var(--muted); }
 
-.heatmap {
-  display: flex;
-  flex-wrap: wrap;
-  gap: 2px;
-  margin-top: 10px;
-}
-.heat-cell {
-  width: 12px;
-  height: 12px;
-  border-radius: 2px;
-  background: rgba(115, 100, 82, 0.18);
-}
-.heat-cell.page-pass { background: var(--success); }
-.heat-cell.page-warn { background: var(--warn); }
-.heat-cell.page-fail { background: var(--error); }
-.heat-cell.page-empty { background: rgba(115, 100, 82, 0.5); }
-.heat-cell.page-pending { background: rgba(115, 100, 82, 0.18); }
-
 /* ---------------- inspector ---------------- */
 
 .inspector { padding: 0; }
@@ -530,7 +557,6 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 
 @media (max-width: 720px) {
   .topbar { flex-direction: column; align-items: stretch; gap: 10px; }
-  .run-summary { grid-template-columns: repeat(2, 1fr); }
   .toast-container { right: 10px; left: 10px; }
 }
 
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index ef99aaa..97481db 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -176,28 +176,30 @@ <h2>Run <code x-text="selectedRun?.id"></code></h2>
         </div>
       </header>
 
-      <section class="run-summary">
-        <div class="summary-card">
-          <span class="summary-label">Progress</span>
-          <strong x-text="Math.round((selectedRun?.progress || 0) * 100) + '%'"></strong>
-          <div class="progress-track" x-show="selectedRun?.status === 'processing' || selectedRun?.status === 'completed'">
-            <div class="progress-fill" :style="`width:${(selectedRun?.progress || 0) * 100}%`"></div>
+      <section class="run-terminal" aria-label="Run extraction progress">
+        <div class="run-terminal-head">
+          <div class="run-terminal-status">
+            <span class="run-spinner" x-show="['queued', 'processing'].includes(selectedRun?.status)"></span>
+            <span class="run-terminal-label" x-text="selectedRun?.stage || selectedRun?.status || 'idle'"></span>
           </div>
+          <code x-text="runDocumentProgressLabel()"></code>
         </div>
-        <div class="summary-card">
-          <span class="summary-label">Pages</span>
-          <strong>
-            <span x-text="selectedRun?.pages_completed || 0"></span>
-            <span class="muted-note">/ <span x-text="selectedRun?.pages_total || 0"></span></span>
-          </strong>
+        <div class="run-terminal-bar">
+          <div class="run-terminal-fill" :style="`width:${runProgressPercent()}%`"></div>
         </div>
-        <div class="summary-card">
-          <span class="summary-label">Stage</span>
-          <strong x-text="selectedRun?.stage || '—'"></strong>
-        </div>
-        <div class="summary-card">
-          <span class="summary-label">Started</span>
-          <strong x-text="formatTimestamp(selectedRun?.started_at || selectedRun?.created_at)"></strong>
+        <div class="run-terminal-body">
+          <p>
+            <span class="terminal-key">document</span>
+            <span x-text="currentRunDocument()?.filename || 'waiting'"></span>
+          </p>
+          <p>
+            <span class="terminal-key">pages</span>
+            <span x-text="runPageProgressLabel()"></span>
+          </p>
+          <p>
+            <span class="terminal-key">stats</span>
+            <span x-text="runStatsLabel()"></span>
+          </p>
         </div>
       </section>
 
@@ -236,17 +238,6 @@ <h3 class="section-title">Documents</h3>
                 <span x-text="`${doc.pages_pass}P / ${doc.pages_warn}W / ${doc.pages_fail}F`"></span>
                 <span class="muted-note" x-show="doc.dominant_script"> · <span x-text="doc.dominant_script"></span></span>
               </div>
-              <div class="heatmap" x-show="doc.total_pages">
-                <template x-for="p in doc.total_pages" :key="p">
-                  <span class="heat-cell"
-                        :class="(() => {
-                          const pages = inspector.documentId === doc.document_id ? (inspector.document?.pages || []) : [];
-                          const found = pages.find(pp => pp.page_num === p);
-                          return found ? pageStatusClass(found.status) : 'page-pending';
-                        })()"
-                        :title="`Page ${p}`"></span>
-                </template>
-              </div>
             </li>
           </template>
         </ul>
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index 2481859..2dbe4d5 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -187,6 +187,42 @@ function opencrApp() {
       return (this.inspector.document?.pages || []).find(p => p.page_num === pageNum)?.status || 'pending';
     },
 
+    currentRunDocument() {
+      const documents = this.selectedRun?.documents || [];
+      if (documents.length === 0) return null;
+      // Preference: in-flight first, then waiting, then failed; otherwise the last document.
+      const withStatus = (...wanted) => documents.find(doc => wanted.includes(doc.status));
+      return withStatus('processing') || withStatus('pending', 'queued') || withStatus('failed')
+        || documents[documents.length - 1];
+    },
+
+    currentRunDocumentIndex() {
+      const docs = this.selectedRun?.documents || [];
+      const current = this.currentRunDocument();
+      const index = current ? docs.findIndex(d => d.document_id === current.document_id) : -1;
+      return index === -1 ? 0 : index + 1;
+    },
+
+    runDocumentProgressLabel() {
+      const total = this.selectedRun?.documents_total || (this.selectedRun?.documents || []).length || 0;
+      return `[${this.currentRunDocumentIndex()}/${total}]`;
+    },
+
+    runProgressPercent() {
+      return Math.max(0, Math.min(100, Math.round((this.selectedRun?.progress || 0) * 100)));
+    },
+
+    runPageProgressLabel() {
+      return `${this.selectedRun?.pages_completed || 0}/${this.selectedRun?.pages_total || 0}`;
+    },
+
+    runStatsLabel() {
+      const documents = this.selectedRun?.documents || [];
+      const tally = field => documents.reduce((total, doc) => total + (doc[field] || 0), 0);
+      const [warnings, failures] = ['pages_warn', 'pages_fail'].map(field => tally(field));
+      return `${this.runProgressPercent()}% · ${warnings} warn · ${failures} fail`;
+    },
+
     pageStatusClass(status) { return PAGE_STATUS[status] || 'page-pending'; },
     runStatusClass(status) { return STATUS_PILL[status] || 'pill-muted'; },
 
diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py
index 1889184..d02f47f 100644
--- a/tests/test_ui_routes.py
+++ b/tests/test_ui_routes.py
@@ -1,4 +1,5 @@
 from fastapi.testclient import TestClient
+from pathlib import Path
 
 import ocr_pipeline.main as main_module
 from ocr_pipeline.config import settings
@@ -59,3 +60,17 @@ async def fake_wait_for_model_server():
         resp = client.get("/api/runs")
         assert resp.status_code == 200
         assert resp.json() == []
+
+
+def test_run_detail_uses_minimal_terminal_progress():
+    repo_root = Path(__file__).parents[1]
+    html = (repo_root / "ocr_pipeline/static/index.html").read_text(encoding="utf-8")
+    app_js = (repo_root / "ocr_pipeline/static/js/app.js").read_text(encoding="utf-8")
+
+    assert 'class="run-terminal"' in html
+    assert 'x-text="runDocumentProgressLabel()"' in html
+    assert 'class="run-spinner"' in html
+    assert 'class="summary-card"' not in html
+    assert 'class="heatmap"' not in html
+    assert "currentRunDocument()" in app_js
+    assert "runDocumentProgressLabel()" in app_js

From da34827287d46b34d381f1e39e550d8f08ea3898 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 20:35:47 +0300
Subject: [PATCH 02/15] feat: introduce document catalog and workbench for
 metadata management and PDF orchestration

---
 ocr_pipeline/main.py                      |   4 +-
 ocr_pipeline/models/schemas.py            |  41 ++++++
 ocr_pipeline/routers/documents.py         |  50 +++++++
 ocr_pipeline/routers/ui.py                |   3 +
 ocr_pipeline/services/dataset_exporter.py |  41 +++++-
 ocr_pipeline/services/db.py               | 136 +++++++++++++++++++
 ocr_pipeline/services/document_catalog.py |  36 +++++
 ocr_pipeline/services/run_orchestrator.py |  10 +-
 ocr_pipeline/static/css/style.css         |  87 ++++++++++++
 ocr_pipeline/static/index.html            | 155 +++++++++++++---------
 ocr_pipeline/static/js/api.js             |  19 +++
 ocr_pipeline/static/js/app.js             |  83 +++++++++++-
 tests/test_dataset_exporter.py            |  20 ++-
 tests/test_document_catalog.py            |  43 ++++++
 tests/test_ui_routes.py                   |  33 +++++
 15 files changed, 684 insertions(+), 77 deletions(-)
 create mode 100644 ocr_pipeline/routers/documents.py
 create mode 100644 ocr_pipeline/services/document_catalog.py
 create mode 100644 tests/test_document_catalog.py

diff --git a/ocr_pipeline/main.py b/ocr_pipeline/main.py
index 8a69d6f..0c91e17 100644
--- a/ocr_pipeline/main.py
+++ b/ocr_pipeline/main.py
@@ -9,7 +9,7 @@
 from starlette.middleware.sessions import SessionMiddleware
 
 from ocr_pipeline.config import settings
-from ocr_pipeline.routers import auth, health, extract, jobs, metrics, runs, ui
+from ocr_pipeline.routers import auth, documents, health, extract, jobs, metrics, runs, ui
 from ocr_pipeline.services.db import init_database
 from ocr_pipeline.services.run_orchestrator import init_orchestrator
 from ocr_pipeline.services.run_storage import RunStorage
@@ -71,7 +71,7 @@ async def lifespan(app: FastAPI):
     https_only=False,
 )
 
-for r in (health, extract, jobs, runs, metrics, ui, auth):
+for r in (health, extract, jobs, runs, documents, metrics, ui, auth):
     app.include_router(r.router)
 
 _static_dir = Path(__file__).parent / "static"
diff --git a/ocr_pipeline/models/schemas.py b/ocr_pipeline/models/schemas.py
index 635cc3f..575e963 100644
--- a/ocr_pipeline/models/schemas.py
+++ b/ocr_pipeline/models/schemas.py
@@ -68,6 +68,47 @@ class FileInfo(BaseModel):
     path: str
 
 
+class DocumentUpdate(BaseModel):
+    display_title: Optional[str] = None
+    author: Optional[str] = None
+    work: Optional[str] = None
+    book: Optional[str] = None
+    document_date_label: Optional[str] = None
+    document_date_precision: Optional[str] = None
+    language: Optional[str] = None
+    script: Optional[str] = None
+    license: Optional[str] = None
+    source_citation: Optional[str] = None
+    notes: Optional[str] = None
+    tags_json: Optional[str] = None
+
+
+class DocumentSummary(BaseModel):
+    id: str
+    filename: str
+    display_title: str
+    source_path: str
+    file_sha256: str
+    file_size_bytes: int
+    total_pages: Optional[int] = None
+    pdf_title: Optional[str] = None
+    pdf_author: Optional[str] = None
+    author: Optional[str] = None
+    work: Optional[str] = None
+    book: Optional[str] = None
+    document_date_label: Optional[str] = None
+    document_date_precision: Optional[str] = None
+    language: Optional[str] = None
+    script: Optional[str] = None
+    license: Optional[str] = None
+    source_citation: Optional[str] = None
+    notes: Optional[str] = None
+    tags_json: Optional[str] = None
+    metadata_complete: bool = False
+    latest_run_id: Optional[str] = None
+    latest_run_status: Optional[str] = None
+
+
 class StagedDocumentInfo(BaseModel):
     document_id: str
     filename: str
diff --git a/ocr_pipeline/routers/documents.py b/ocr_pipeline/routers/documents.py
new file mode 100644
index 0000000..f26c8c5
--- /dev/null
+++ b/ocr_pipeline/routers/documents.py
@@ -0,0 +1,50 @@
+from fastapi import APIRouter, HTTPException, Path as PathParam, Query
+
+from ocr_pipeline.models.schemas import DocumentSummary, DocumentUpdate, RunSummary
+from ocr_pipeline.routers.runs import _run_summary
+from ocr_pipeline.services.db import get_db
+
+
+router = APIRouter()
+
+ID = PathParam(..., pattern=r"^[A-Za-z0-9_\-]{1,64}$")
+
+
+def _document_summary(row: dict) -> DocumentSummary:
+    data = dict(row)
+    data["display_title"] = data.get("display_title") or data.get("pdf_title") or data["filename"]
+    data["metadata_complete"] = bool(data.get("metadata_complete"))
+    return DocumentSummary(**data)
+
+
+@router.get("/api/documents", response_model=list[DocumentSummary])
+async def list_documents(limit: int = Query(500, ge=1, le=1000)):
+    return [_document_summary(d) for d in await get_db().list_documents(limit=limit)]
+
+
+@router.get("/api/documents/{document_id}", response_model=DocumentSummary)
+async def get_document(document_id: str = ID):  # NOTE(review): scans up to 1000 listed rows to find one
+    doc = await get_db().get_document(document_id)  # plain row: existence check and fallback
+    if not doc:
+        raise HTTPException(status_code=404, detail="Document not found")
+    listed = [d for d in await get_db().list_documents(limit=1000) if d["id"] == document_id]
+    return _document_summary(listed[0] if listed else doc)  # listed row has the computed columns
+
+
+@router.patch("/api/documents/{document_id}", response_model=DocumentSummary)
+async def update_document(payload: DocumentUpdate, document_id: str = ID):
+    try:
+        await get_db().update_document_metadata(
+            document_id,
+            **payload.model_dump(exclude_unset=True),  # only the fields the client actually sent
+        )
+    except KeyError:  # raised by the DB layer when no row has this id
+        raise HTTPException(status_code=404, detail="Document not found") from None
+    return await get_document(document_id)
+
+
+@router.get("/api/documents/{document_id}/runs", response_model=list[RunSummary])
+async def list_document_runs(document_id: str = ID):
+    if not await get_db().get_document(document_id):
+        raise HTTPException(status_code=404, detail="Document not found")
+    return [_run_summary(r) for r in await get_db().list_document_runs(document_id)]
diff --git a/ocr_pipeline/routers/ui.py b/ocr_pipeline/routers/ui.py
index 451264e..2b9c19c 100644
--- a/ocr_pipeline/routers/ui.py
+++ b/ocr_pipeline/routers/ui.py
@@ -6,6 +6,8 @@
 
 from ocr_pipeline.config import settings
 from ocr_pipeline.models.schemas import FileInfo
+from ocr_pipeline.services.db import get_db
+from ocr_pipeline.services.document_catalog import catalog_pdf
 
 
 router = APIRouter()
@@ -25,6 +27,7 @@ async def upload_pdf(file: UploadFile):
     dest = settings.input_dir / safe_name
     content = await file.read()
     dest.write_bytes(content)
+    await catalog_pdf(get_db(), dest, filename=safe_name)
 
     return {"filename": safe_name, "size": len(content), "path": str(dest)}
 
diff --git a/ocr_pipeline/services/dataset_exporter.py b/ocr_pipeline/services/dataset_exporter.py
index 67394b4..fe7645e 100644
--- a/ocr_pipeline/services/dataset_exporter.py
+++ b/ocr_pipeline/services/dataset_exporter.py
@@ -1,7 +1,7 @@
 import hashlib
 import json
 import zipfile
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, field
 from pathlib import Path
 
 import pyarrow as pa
@@ -29,6 +29,7 @@ class DocumentExport:
     metadata: DocumentMetadata
     document_id: str
     artifact_paths: ArtifactPaths
+    catalog_metadata: dict = field(default_factory=dict)
 
 
 class DatasetExporter:
@@ -53,6 +54,14 @@ def _split_name(stable_key: str) -> str:
             return "validation"
         return "test"
 
+    @staticmethod
+    def _language_list(value) -> list[str]:
+        """Normalize a list or comma-separated string into stripped, non-empty strings."""
+        if not isinstance(value, list):
+            value = str(value).split(",") if value else []
+        cleaned = (str(item).strip() for item in value)
+        return [item for item in cleaned if item]
+
     def export_run(
         self,
         run_id: str,
@@ -66,6 +75,7 @@ def export_run(
 
         for entry in documents:
             doc_meta = entry.metadata
+            catalog = entry.catalog_metadata or {}
             paths = entry.artifact_paths
             raw_text = paths.raw_txt.read_text(encoding="utf-8") if paths.raw_txt.exists() else ""
             clean_text = paths.clean_txt.read_text(encoding="utf-8") if paths.clean_txt.exists() else ""
@@ -83,6 +93,16 @@ def export_run(
                         "run_id": run_id,
                         "document_id": entry.document_id,
                         "document_name": doc_meta.filename,
+                        "title": catalog.get("display_title") or catalog.get("title") or doc_meta.pdf_title,
+                        "author": catalog.get("author") or doc_meta.pdf_author,
+                        "work": catalog.get("work"),
+                        "book": catalog.get("book"),
+                        "document_date_label": catalog.get("document_date_label"),
+                        "document_date_precision": catalog.get("document_date_precision"),
+                        "language": self._language_list(catalog.get("language")) or page_meta.detected_languages,
+                        "script": catalog.get("script") or page_meta.primary_script,
+                        "license": catalog.get("license"),
+                        "source_citation": catalog.get("source_citation"),
                         "page_number": page_meta.page_num,
                         "source_pdf_sha256": doc_meta.file_sha256,
                         "raw_text": page_raw_text,
@@ -105,12 +125,23 @@ def export_run(
                 )
 
             document_rows.append(
-                {
+                {
                     "dataset_export_id": export_id,
                     "run_id": run_id,
-                    "document_id": entry.document_id,
-                    "document_name": doc_meta.filename,
-                    "source_pdf_sha256": doc_meta.file_sha256,
+                    "document_id": entry.document_id,
+                    "document_name": doc_meta.filename,
+                    "title": catalog.get("display_title") or catalog.get("title") or doc_meta.pdf_title,
+                    "author": catalog.get("author") or doc_meta.pdf_author,
+                    "work": catalog.get("work"),
+                    "book": catalog.get("book"),
+                    "document_date_label": catalog.get("document_date_label"),
+                    "document_date_precision": catalog.get("document_date_precision"),
+                    "language": self._language_list(catalog.get("language")) or doc_meta.languages_detected,
+                    "script": catalog.get("script") or doc_meta.dominant_script,
+                    "license": catalog.get("license"),
+                    "source_citation": catalog.get("source_citation"),
+                    "notes": catalog.get("notes"),
+                    "source_pdf_sha256": doc_meta.file_sha256,
                     "page_count": doc_meta.total_pages,
                     "raw_text": raw_text,
                     "clean_text": clean_text,
diff --git a/ocr_pipeline/services/db.py b/ocr_pipeline/services/db.py
index 9b6d3eb..d88b671 100644
--- a/ocr_pipeline/services/db.py
+++ b/ocr_pipeline/services/db.py
@@ -10,6 +10,38 @@
 logger = logging.getLogger("ocr_pipeline.db")
 
 
+DOCUMENT_METADATA_FIELDS = {
+    "display_title",
+    "author",
+    "work",
+    "book",
+    "document_date_label",
+    "document_date_precision",
+    "language",
+    "script",
+    "license",
+    "source_citation",
+    "notes",
+    "tags_json",
+}
+
+DOCUMENT_METADATA_COLUMNS = {
+    "display_title": "TEXT",
+    "author": "TEXT",
+    "work": "TEXT",
+    "book": "TEXT",
+    "document_date_label": "TEXT",
+    "document_date_precision": "TEXT",
+    "language": "TEXT",
+    "script": "TEXT",
+    "license": "TEXT",
+    "source_citation": "TEXT",
+    "notes": "TEXT",
+    "tags_json": "TEXT",
+    "catalog_updated_at": "TEXT",
+}
+
+
 SCHEMA = """
 CREATE TABLE IF NOT EXISTS runs (
     id TEXT PRIMARY KEY,
@@ -43,6 +75,19 @@
     pdf_author TEXT,
     pdf_creation_date TEXT,
     pdf_producer TEXT,
+    display_title TEXT,
+    author TEXT,
+    work TEXT,
+    book TEXT,
+    document_date_label TEXT,
+    document_date_precision TEXT,
+    language TEXT,
+    script TEXT,
+    license TEXT,
+    source_citation TEXT,
+    notes TEXT,
+    tags_json TEXT,
+    catalog_updated_at TEXT,
     first_seen_at TEXT NOT NULL,
     last_seen_at TEXT NOT NULL
 );
@@ -139,6 +184,7 @@ async def connect(self) -> None:
         await self._conn.execute("PRAGMA journal_mode=WAL;")
         await self._conn.execute("PRAGMA foreign_keys=ON;")
         await self._conn.executescript(SCHEMA)
+        await self._migrate()
         await self._conn.commit()
         logger.info("Database ready at %s", self.db_path)
 
@@ -153,6 +199,14 @@ def conn(self) -> aiosqlite.Connection:
             raise RuntimeError("Database not connected; call connect() first.")
         return self._conn
 
+    async def _migrate(self) -> None:
+        """Add any catalog metadata columns missing from an existing ``documents`` table."""
+        async with self.conn.execute("PRAGMA table_info(documents)") as cur:
+            existing = {row["name"] for row in await cur.fetchall()}
+        for name, column_type in DOCUMENT_METADATA_COLUMNS.items():
+            if name not in existing:  # only add what is missing, so reruns change nothing
+                await self.conn.execute(f"ALTER TABLE documents ADD COLUMN {name} {column_type}")
+
     @asynccontextmanager
     async def cursor(self) -> AsyncIterator[aiosqlite.Cursor]:
         async with self.conn.cursor() as cur:
@@ -269,6 +323,8 @@ async def upsert_document(
             ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ON CONFLICT(id) DO UPDATE SET
                 filename = excluded.filename,
+                source_path = excluded.source_path,
+                file_size_bytes = excluded.file_size_bytes,
                 last_seen_at = excluded.last_seen_at,
                 total_pages = COALESCE(excluded.total_pages, documents.total_pages),
                 pdf_title = COALESCE(excluded.pdf_title, documents.pdf_title),
@@ -300,6 +356,86 @@ async def get_document(self, document_id: str) -> Optional[dict[str, Any]]:
         ) as cur:
             return _row_to_dict(await cur.fetchone())
 
+    async def list_documents(self, limit: int = 500) -> list[dict[str, Any]]:
+        async with self.conn.execute(
+            """
+            SELECT d.id, d.filename, d.source_path, d.file_sha256, d.file_size_bytes,
+                   d.total_pages, d.pdf_title, d.pdf_author, d.pdf_creation_date, d.pdf_producer,
+                   d.author, d.work, d.book, d.document_date_label, d.document_date_precision,
+                   d.language, d.script, d.license, d.source_citation, d.notes, d.tags_json,
+                   d.catalog_updated_at, d.first_seen_at, d.last_seen_at,
+                   COALESCE(NULLIF(d.display_title, ''), NULLIF(d.pdf_title, ''), d.filename)
+                       AS display_title,
+                   CASE
+                     WHEN COALESCE(d.author, '') != ''
+                      AND COALESCE(d.work, '') != ''
+                      AND COALESCE(d.document_date_label, '') != ''
+                      AND COALESCE(d.document_date_precision, '') != ''
+                      AND COALESCE(d.language, '') != ''
+                      AND COALESCE(d.script, '') != ''
+                      AND COALESCE(d.license, '') != ''
+                     THEN 1 ELSE 0
+                   END AS metadata_complete,
+                   (
+                     SELECT r.id
+                     FROM run_documents rd
+                     JOIN runs r ON r.id = rd.run_id
+                     WHERE rd.document_id = d.id
+                     ORDER BY r.created_at DESC
+                     LIMIT 1
+                   ) AS latest_run_id,
+                   (
+                     SELECT r.status
+                     FROM run_documents rd
+                     JOIN runs r ON r.id = rd.run_id
+                     WHERE rd.document_id = d.id
+                     ORDER BY r.created_at DESC
+                     LIMIT 1
+                   ) AS latest_run_status
+            FROM documents d
+            ORDER BY d.last_seen_at DESC
+            LIMIT ?
+            """,
+            (limit,),
+        ) as cur:
+            rows = await cur.fetchall()
+            return [_row_to_dict(r) for r in rows]  # type: ignore[misc]
+
+    async def update_document_metadata(self, document_id: str, **fields: Any) -> dict[str, Any]:
+        clean = {k: v for k, v in fields.items() if k in DOCUMENT_METADATA_FIELDS}  # drop unknown keys
+        if clean:
+            clean["catalog_updated_at"] = _now()
+            cols = ", ".join(f"{k} = ?" for k in clean)  # names are whitelisted above; values stay bound
+            values = [*clean.values(), document_id]
+            cur = await self.conn.execute(
+                f"UPDATE documents SET {cols} WHERE id = ?",
+                values,
+            )
+            await self.conn.commit()
+            affected = cur.rowcount or 0
+            await cur.close()
+            if not affected:
+                raise KeyError(document_id)  # no row matched the id
+
+        doc = await self.get_document(document_id)
+        if not doc:
+            raise KeyError(document_id)  # also covers the path where nothing was updated
+        return doc
+
+    async def list_document_runs(self, document_id: str) -> list[dict[str, Any]]:
+        async with self.conn.execute(
+            """
+            SELECT r.*, rd.status AS document_status, rd.pages_pass, rd.pages_warn, rd.pages_fail
+            FROM run_documents rd
+            JOIN runs r ON r.id = rd.run_id
+            WHERE rd.document_id = ?
+            ORDER BY r.created_at DESC
+            """,
+            (document_id,),
+        ) as cur:
+            rows = await cur.fetchall()
+            return [_row_to_dict(r) for r in rows]  # type: ignore[misc]
+
     async def get_document_by_sha(self, file_sha256: str) -> Optional[dict[str, Any]]:
         async with self.conn.execute(
             "SELECT * FROM documents WHERE file_sha256 = ?", (file_sha256,)
diff --git a/ocr_pipeline/services/document_catalog.py b/ocr_pipeline/services/document_catalog.py
new file mode 100644
index 0000000..6cc94a3
--- /dev/null
+++ b/ocr_pipeline/services/document_catalog.py
@@ -0,0 +1,36 @@
+import asyncio
+import hashlib
+from pathlib import Path
+
+import fitz
+
+from ocr_pipeline.services.db import Database
+
+
+def _hash_file_sync(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _count_pages_sync(path: Path) -> int:
+    with fitz.open(str(path)) as doc:
+        return len(doc)
+
+
+async def catalog_pdf(db: Database, path: Path, *, filename: str | None = None) -> dict:
+    sha = await asyncio.to_thread(_hash_file_sync, path)  # blocking file work runs in a worker thread
+    try:
+        page_count = await asyncio.to_thread(_count_pages_sync, path)
+    except Exception:  # best-effort: any page-count failure still lets the file be cataloged
+        page_count = 0
+    return await db.upsert_document(
+        sha[:16],  # 16-hex-char SHA-256 prefix (presumably the document id — confirm)
+        filename=filename or path.name,
+        source_path=str(path),
+        file_sha256=sha,
+        file_size_bytes=(await asyncio.to_thread(path.stat)).st_size,
+        total_pages=page_count or None,  # a zero count is passed as None
+    )
diff --git a/ocr_pipeline/services/run_orchestrator.py b/ocr_pipeline/services/run_orchestrator.py
index 5561e04..f3e02cf 100644
--- a/ocr_pipeline/services/run_orchestrator.py
+++ b/ocr_pipeline/services/run_orchestrator.py
@@ -227,8 +227,14 @@ async def _maybe_export(self, run_id: str, documents_meta: list, export_parquet:
             return None
         await self.db.update_run(run_id, stage="exporting")
         await self._emit(run_id, "dataset_export_started", {})
-        exports = [DocumentExport(metadata=m, document_id=did, artifact_paths=p)
-                   for (did, p, m) in documents_meta]
+        exports = []
+        for did, paths, meta in documents_meta:
+            exports.append(DocumentExport(
+                metadata=meta,
+                document_id=did,
+                artifact_paths=paths,
+                catalog_metadata=await self.db.get_document(did) or {},
+            ))
         result = await asyncio.to_thread(
             DatasetExporter(self.storage.dataset_dir(run_id)).export_run,
             run_id, exports,
diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css
index 310d65c..eb90fa6 100644
--- a/ocr_pipeline/static/css/style.css
+++ b/ocr_pipeline/static/css/style.css
@@ -113,6 +113,8 @@ a { color: var(--accent); }
   min-height: calc(100vh - 100px);
 }
 
+.console-grid.document-mode { grid-template-columns: 280px minmax(0, 1fr); }
+
 .rail, .stage, .inspector {
   border: 1px solid var(--border);
   border-radius: var(--radius);
@@ -280,6 +282,86 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 
 .intake-cta { display: flex; align-items: center; gap: 14px; margin-top: 6px; }
 
+/* ---------------- document workbench ---------------- */
+
+.document-workbench { display: flex; flex-direction: column; min-height: 100%; }
+
+.document-workbench-body {
+  display: grid;
+  grid-template-columns: minmax(0, 1fr) 340px;
+  gap: 14px;
+  padding: 16px 24px 24px;
+}
+
+.document-library,
+.document-editor {
+  border: 1px solid var(--border);
+  border-radius: var(--radius-sm);
+  background: rgba(255, 255, 255, 0.62);
+  overflow: hidden;
+}
+
+.document-toolbar {
+  display: flex;
+  align-items: center;
+  gap: 12px;
+  min-height: 48px;
+  padding: 8px 12px;
+  border-bottom: 1px solid var(--border);
+}
+.document-toolbar.drag-over { background: var(--accent-soft); }
+
+.document-table { display: grid; }
+.document-row {
+  display: grid;
+  grid-template-columns: 24px minmax(220px, 1fr) 86px 84px 78px;
+  align-items: center;
+  gap: 10px;
+  min-height: 52px;
+  padding: 8px 12px;
+  border-bottom: 1px solid var(--border);
+  cursor: pointer;
+}
+.document-row:last-child { border-bottom: none; }
+.document-row:hover { background: rgba(255, 255, 255, 0.72); }
+.document-row.active { background: var(--accent-soft); }
+.document-row-head {
+  min-height: 34px;
+  cursor: default;
+  background: rgba(115, 100, 82, 0.08);
+  color: var(--muted);
+  font-size: 0.72rem;
+  font-weight: 700;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+}
+.document-title { min-width: 0; display: grid; gap: 2px; }
+.document-title strong,
+.document-title span { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
+.document-title span { color: var(--muted); font-size: 0.78rem; }
+
+.document-editor-head {
+  padding: 14px 16px;
+  border-bottom: 1px solid var(--border);
+}
+.document-editor-head .eyebrow {
+  margin: 0 0 4px;
+  text-transform: uppercase;
+  letter-spacing: 0.12em;
+  color: var(--muted);
+  font-size: 0.66rem;
+  font-weight: 700;
+}
+.document-editor-head h3 { margin: 0; font-size: 1rem; }
+.document-editor-fields { display: grid; gap: 10px; padding: 14px 16px; }
+.field-row { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; }
+.document-editor-actions {
+  display: flex;
+  justify-content: flex-end;
+  padding: 12px 16px;
+  border-top: 1px solid var(--border);
+}
+
 /* ---------------- buttons ---------------- */
 
 .btn {
@@ -543,6 +625,7 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 
 @media (max-width: 1280px) {
   .console-grid { grid-template-columns: 240px minmax(0, 1fr) 460px; }
+  .console-grid.document-mode { grid-template-columns: 240px minmax(0, 1fr); }
 }
 
 @media (max-width: 1080px) {
@@ -553,10 +636,14 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
   .rail { max-height: 320px; }
   .inspector { max-height: 720px; }
   .metric-strip { display: none; }
+  .document-workbench-body { grid-template-columns: 1fr; }
 }
 
 @media (max-width: 720px) {
   .topbar { flex-direction: column; align-items: stretch; gap: 10px; }
+  .document-row { grid-template-columns: 24px minmax(160px, 1fr) 72px 76px; }
+  .document-row > :last-child { display: none; }
+  .field-row { grid-template-columns: 1fr; }
   .toast-container { right: 10px; left: 10px; }
 }
 
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index 97481db..242915b 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -46,7 +46,7 @@ <h1>OpenCR</h1>
   </div>
 </header>
 
-<main class="console-grid">
+<main class="console-grid" :class="{ 'document-mode': !selectedRunId }">
 
   <!-- Left rail: Runs sidebar -->
   <aside class="rail">
@@ -87,78 +87,101 @@ <h2>Runs</h2>
   <!-- Center: Run detail or intake -->
   <section class="stage">
 
-    <!-- Intake panel when no run selected -->
-    <div class="intake" x-show="!selectedRunId">
+    <!-- Document workbench when no run selected -->
+    <div class="document-workbench" x-show="!selectedRunId">
       <header class="stage-header">
-        <h2>Start a new run</h2>
-        <p class="stage-sub">Drop PDFs, pick which to extract, and OpenCR will OCR, validate, and bundle them.</p>
+        <div>
+          <h2>Documents</h2>
+          <p class="stage-sub">Select PDFs, edit metadata, then run extraction.</p>
+        </div>
+        <div class="stage-actions">
+          <button class="btn btn-ghost btn-sm" @click="$refs.fileInput.click()">Upload PDFs</button>
+          <button class="btn btn-primary btn-sm"
+                  :disabled="selectedDocumentIds.length === 0 || creating"
+                  @click="startDocumentsRun">
+            <span x-show="!creating">Start run</span>
+            <span x-show="creating">Starting...</span>
+          </button>
+        </div>
       </header>
 
-      <div class="drop-zone"
-           :class="{ 'drag-over': dragOver }"
-           @dragover.prevent="dragOver = true"
-           @dragleave.prevent="dragOver = false"
-           @drop.prevent="handleDrop($event)"
-           @click="$refs.fileInput.click()">
-        <input type="file" accept=".pdf" multiple x-ref="fileInput" hidden @change="handleFileSelect($event)">
-        <p class="drop-title">Drop PDFs or click to browse</p>
-        <p class="drop-note">Files are deduplicated by content hash. Re-uploading the same PDF is free.</p>
-        <p class="drop-progress" x-show="uploadProgress !== null" x-text="'Uploading ' + uploadProgress + '%'"></p>
-      </div>
-
-      <div class="intake-toolbar">
-        <label class="checkbox-row">
-          <input type="checkbox" :checked="allInputsSelected" @change="selectAllInputs($event.target.checked)">
-          <span>Select all</span>
-        </label>
-        <span class="muted-note">
-          <span x-text="selectedPaths.length"></span> of <span x-text="inputFiles.length"></span> selected
-        </span>
-        <button class="btn btn-ghost btn-sm" @click="refreshInputFiles">Refresh inputs</button>
-      </div>
+      <input type="file" accept=".pdf" multiple x-ref="fileInput" hidden @change="handleFileSelect($event)">
 
-      <ul class="file-list" x-show="inputFiles.length > 0">
-        <template x-for="file in inputFiles" :key="file.path">
-          <li class="file-item">
-            <input type="checkbox"
-                   :checked="selectedPaths.includes(file.path)"
-                   @change="toggleSelected(file.path)">
-            <span class="file-name" x-text="file.name"></span>
-            <span class="file-size" x-text="formatSize(file.size)"></span>
-          </li>
-        </template>
-      </ul>
-      <p class="empty-note" x-show="inputFiles.length === 0">No PDFs in input directory. Upload above.</p>
-
-      <div class="intake-options">
-        <label class="field">
-          <span>Run name (optional)</span>
-          <input type="text" placeholder="e.g. arabic-corpus-batch-3" x-model="intakeOptions.name">
-        </label>
-        <label class="option-toggle">
-          <input type="checkbox" x-model="intakeOptions.stripRefs">
-          <div>
-            <strong>Strip reference blocks</strong>
-            <p>Remove DeepSeek bounding-box markup from cleaned text.</p>
+      <div class="document-workbench-body">
+        <section class="document-library">
+          <div class="document-toolbar"
+               :class="{ 'drag-over': dragOver }"
+               @dragover.prevent="dragOver = true"
+               @dragleave.prevent="dragOver = false"
+               @drop.prevent="handleDrop($event)">
+            <label class="checkbox-row">
+              <input type="checkbox" :checked="allDocumentsSelected" @change="selectAllDocuments($event.target.checked)">
+              <span><span x-text="selectedDocumentIds.length"></span> selected</span>
+            </label>
+            <button class="btn btn-ghost btn-sm" @click="refreshDocuments">Refresh</button>
+            <span class="muted-note" x-show="uploadProgress !== null" x-text="'Uploading ' + uploadProgress + '%'"></span>
           </div>
-        </label>
-        <label class="option-toggle">
-          <input type="checkbox" x-model="intakeOptions.exportParquet">
-          <div>
-            <strong>Export Parquet bundle</strong>
-            <p>Build train/validation/test-split Parquet for HuggingFace.</p>
+
+          <div class="document-table" x-show="documents.length > 0">
+            <div class="document-row document-row-head">
+              <span></span>
+              <span>Document</span>
+              <span>Date</span>
+              <span>Metadata</span>
+              <span>Last run</span>
+            </div>
+            <template x-for="doc in documents" :key="doc.id">
+              <div class="document-row"
+                   :class="{ active: selectedDocumentId === doc.id }"
+                   @click="selectDocument(doc.id)">
+                <input type="checkbox"
+                       :checked="selectedDocumentIds.includes(doc.id)"
+                       @click.stop
+                       @change="toggleDocument(doc.id)">
+                <div class="document-title">
+                  <strong x-text="doc.display_title || doc.filename"></strong>
+                  <span x-text="doc.filename"></span>
+                </div>
+                <span x-text="doc.document_date_label || '—'"></span>
+                <span class="pill pill-sm"
+                      :class="doc.metadata_complete ? 'pill-success' : 'pill-warn'"
+                      x-text="doc.metadata_complete ? 'ready' : 'missing'"></span>
+                <span x-text="doc.latest_run_status || 'never'"></span>
+              </div>
+            </template>
           </div>
-        </label>
-      </div>
+          <p class="empty-note" x-show="documents.length === 0">No PDFs yet. Upload PDFs to start the catalog.</p>
+        </section>
 
-      <div class="intake-cta">
-        <button class="btn btn-primary"
-                :disabled="selectedPaths.length === 0 || creating"
-                @click="startNewRun">
-          <span x-show="!creating">Start OCR run</span>
-          <span x-show="creating">Starting...</span>
-        </button>
-        <p class="muted-note">Pages process concurrently — large PDFs land fast.</p>
+        <aside class="document-editor" x-show="selectedDocumentId">
+          <header class="document-editor-head">
+            <p class="eyebrow">Document metadata</p>
+            <h3 x-text="documentDraft.display_title || documentDraft.filename"></h3>
+          </header>
+          <div class="document-editor-fields">
+            <label class="field"><span>Title</span><input type="text" x-model="documentDraft.display_title"></label>
+            <label class="field"><span>Author</span><input type="text" x-model="documentDraft.author"></label>
+            <label class="field"><span>Work</span><input type="text" x-model="documentDraft.work"></label>
+            <label class="field"><span>Book</span><input type="text" x-model="documentDraft.book"></label>
+            <div class="field-row">
+              <label class="field"><span>Date</span><input type="text" placeholder="1923 or 1900s" x-model="documentDraft.document_date_label"></label>
+              <label class="field"><span>Precision</span><input type="text" placeholder="exact/century" x-model="documentDraft.document_date_precision"></label>
+            </div>
+            <div class="field-row">
+              <label class="field"><span>Language</span><input type="text" placeholder="ota-Latn,tr" x-model="documentDraft.language"></label>
+              <label class="field"><span>Script</span><input type="text" placeholder="latin_extended" x-model="documentDraft.script"></label>
+            </div>
+            <label class="field"><span>License</span><input type="text" placeholder="cc-by-4.0" x-model="documentDraft.license"></label>
+            <label class="field"><span>Source / citation</span><input type="text" x-model="documentDraft.source_citation"></label>
+            <label class="field"><span>Notes</span><input type="text" x-model="documentDraft.notes"></label>
+          </div>
+          <footer class="document-editor-actions">
+            <button class="btn btn-primary" :disabled="savingDocument" @click="saveSelectedDocument">
+              <span x-show="!savingDocument">Save metadata</span>
+              <span x-show="savingDocument">Saving...</span>
+            </button>
+          </footer>
+        </aside>
       </div>
     </div>
 
diff --git a/ocr_pipeline/static/js/api.js b/ocr_pipeline/static/js/api.js
index 9b3d33a..32bcbf5 100644
--- a/ocr_pipeline/static/js/api.js
+++ b/ocr_pipeline/static/js/api.js
@@ -44,6 +44,41 @@ const API = {
     return res.json();
   },
 
+  /**
+   * Fetch the document catalog.
+   * @param {number} [limit=500] Value sent as the `limit` query parameter.
+   * @returns {Promise<any>} Parsed JSON body of `GET /api/documents`.
+   * @throws {Error} With the server's `detail` message when one is returned.
+   */
+  async listDocuments(limit = 500) {
+    const res = await fetch(`/api/documents?limit=${encodeURIComponent(limit)}`);
+    if (!res.ok) {
+      // Same error shape as updateDocument: prefer the API's `detail` text.
+      const data = await res.json().catch(() => ({}));
+      throw new Error(data.detail || 'Failed to list documents');
+    }
+    return res.json();
+  },
+
+  /**
+   * Update one document's metadata via `PATCH /api/documents/{id}`.
+   * @param {string} documentId Catalog id; URL-encoded into the path.
+   * @param {object} payload Fields to send as the JSON request body.
+   * @returns {Promise<any>} Parsed JSON body of the response.
+   * @throws {Error} With the server's `detail` message when one is returned.
+   */
+  async updateDocument(documentId, payload) {
+    const res = await fetch(`/api/documents/${encodeURIComponent(documentId)}`, {
+      method: 'PATCH',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+    });
+    if (!res.ok) {
+      const data = await res.json().catch(() => ({}));
+      throw new Error(data.detail || 'Failed to update document');
+    }
+    return res.json();
+  },
+
   async createRun(filePaths, { name, stripRefs = false, exportParquet = true } = {}) {
     const res = await fetch('/api/runs', {
       method: 'POST',
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index 2dbe4d5..df122f9 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -32,6 +32,12 @@ function opencrApp() {
     selectedRunId: null,
     selectedRun: null,
 
+    documents: [],
+    selectedDocumentIds: [],
+    selectedDocumentId: null,
+    documentDraft: {},
+    savingDocument: false,
+
     inputFiles: [],
     selectedPaths: [],
     intakeOptions: { stripRefs: false, exportParquet: true, name: '' },
@@ -50,6 +56,7 @@ function opencrApp() {
         this.refreshHealth(),
         this.refreshMetrics(),
         this.refreshInputFiles(),
+        this.refreshDocuments(),
         this.refreshRuns(),
         this.refreshAuth(),
       ]);
@@ -100,6 +107,23 @@ function opencrApp() {
       catch (e) { this.toast(`Failed to load inputs: ${e.message}`, 'error'); }
     },
 
+    // Reload the catalog, drop selections that point at documents no longer
+    // listed, and make sure the metadata editor has a document to show.
+    async refreshDocuments() {
+      try {
+        this.documents = await API.listDocuments();
+        const listedIds = new Set(this.documents.map(d => d.id));
+        this.selectedDocumentIds = this.selectedDocumentIds.filter(id => listedIds.has(id));
+        if (!listedIds.has(this.selectedDocumentId)) {
+          // Nothing selected yet, or the selected document disappeared:
+          // fall back to the first listed document (or clear the editor).
+          this.selectDocument(this.documents.length > 0 ? this.documents[0].id : null);
+        }
+      } catch (e) {
+        this.toast(`Failed to load documents: ${e.message}`, 'error');
+      }
+    },
+
     async refreshRuns() {
       try { this.runs = await API.listRuns(50); }
       catch (e) { this.toast(`Failed to load runs: ${e.message}`, 'error'); }
@@ -226,6 +242,86 @@ function opencrApp() {
     pageStatusClass(status) { return PAGE_STATUS[status] || 'page-pending'; },
     runStatusClass(status) { return STATUS_PILL[status] || 'pill-muted'; },
 
+    // Catalog entry currently open in the metadata editor, or null if none matches.
+    selectedDocument() {
+      return this.documents.find(d => d.id === this.selectedDocumentId) || null;
+    },
+
+    // Open a document in the editor. The draft is a shallow copy, so form edits
+    // stay out of the listed row until they are saved and the list is reloaded.
+    selectDocument(documentId) {
+      this.selectedDocumentId = documentId;
+      const doc = this.selectedDocument();
+      this.documentDraft = doc ? { ...doc } : {};
+    },
+
+    // Check or uncheck one document for the next run.
+    toggleDocument(documentId) {
+      const i = this.selectedDocumentIds.indexOf(documentId);
+      if (i === -1) this.selectedDocumentIds.push(documentId);
+      else this.selectedDocumentIds.splice(i, 1);
+    },
+
+    // Check every listed document, or clear all checks.
+    selectAllDocuments(checked) {
+      this.selectedDocumentIds = checked ? this.documents.map(d => d.id) : [];
+    },
+
+    // True when the list is non-empty and as many ids are checked as are listed.
+    get allDocumentsSelected() {
+      return this.documents.length > 0 && this.selectedDocumentIds.length === this.documents.length;
+    },
+
+    // source_path of each checked document, in list order.
+    selectedDocumentPaths() {
+      const checkedIds = new Set(this.selectedDocumentIds);
+      return this.documents
+        .filter(d => checkedIds.has(d.id))
+        .map(d => d.source_path);
+    },
+
+    // Save the editor draft for the open document, then reload the catalog.
+    // Empty draft fields are sent as null.
+    async saveSelectedDocument() {
+      if (!this.selectedDocumentId || this.savingDocument) return;
+      // Pin the id: another document may be opened while the requests below
+      // are still in flight.
+      const documentId = this.selectedDocumentId;
+      this.savingDocument = true;
+      try {
+        await API.updateDocument(documentId, {
+          display_title: this.documentDraft.display_title || null,
+          author: this.documentDraft.author || null,
+          work: this.documentDraft.work || null,
+          book: this.documentDraft.book || null,
+          document_date_label: this.documentDraft.document_date_label || null,
+          document_date_precision: this.documentDraft.document_date_precision || null,
+          language: this.documentDraft.language || null,
+          script: this.documentDraft.script || null,
+          license: this.documentDraft.license || null,
+          source_citation: this.documentDraft.source_citation || null,
+          notes: this.documentDraft.notes || null,
+        });
+        await this.refreshDocuments();
+        // Reload the draft only if the saved document is still the open one, so
+        // a document opened in the meantime keeps whatever was typed into it.
+        if (this.selectedDocumentId === documentId) this.selectDocument(documentId);
+        this.toast('Document metadata saved', 'success');
+      } catch (e) {
+        this.toast(`Metadata save failed: ${e.message}`, 'error');
+      } finally {
+        this.savingDocument = false;
+      }
+    },
+
+    // Start a run over the checked documents by passing their paths to startNewRun.
+    async startDocumentsRun() {
+      const paths = this.selectedDocumentPaths();
+      if (paths.length === 0) return this.toast('Select documents first', 'error');
+      this.selectedPaths = paths;
+      await this.startNewRun();
+    },
+
     async startNewRun() {
       if (this.selectedPaths.length === 0 || this.creating) return;
       this.creating = true;
@@ -239,6 +319,7 @@ function opencrApp() {
         if (dedup > 0) this.toast(`${dedup} document(s) recognized from prior runs`, 'info');
         this.toast(`Run ${result.run_id} queued`, 'success');
         this.selectedPaths = [];
+        this.selectedDocumentIds = [];
         await this.refreshRuns();
         await this.selectRun(result.run_id);
       } catch (e) {
@@ -301,7 +382,7 @@ function opencrApp() {
       this.uploadProgress = null;
       if (count > 0) {
         this.toast(`Uploaded ${count} file(s)`, 'success');
-        await this.refreshInputFiles();
+        await Promise.all([this.refreshInputFiles(), this.refreshDocuments()]);
       }
     },
 
diff --git a/tests/test_dataset_exporter.py b/tests/test_dataset_exporter.py
index 7c012a6..f542725 100644
--- a/tests/test_dataset_exporter.py
+++ b/tests/test_dataset_exporter.py
@@ -26,7 +26,21 @@ def test_export_run_writes_parquet_manifest_and_bundle(tmp_path):
     )
 
     exporter = DatasetExporter(storage.dataset_dir("run-1234"))
-    export = DocumentExport(metadata=document, document_id=document_id, artifact_paths=paths)
+    export = DocumentExport(
+        metadata=document,
+        document_id=document_id,
+        artifact_paths=paths,
+        catalog_metadata={
+            "author": "Evliyâ Çelebi",
+            "work": "Seyahatnâme",
+            "book": "1",
+            "document_date_label": "1900s",
+            "document_date_precision": "century",
+            "language": "ota-Latn,tr",
+            "script": "latin_extended",
+            "license": "cc-by-4.0",
+        },
+    )
     result = exporter.export_run("run-1234", [export])
 
     assert result.pages_parquet.exists()
@@ -40,7 +54,11 @@ def test_export_run_writes_parquet_manifest_and_bundle(tmp_path):
     assert documents_table.num_rows == 1
     assert "raw_text" in pages_table.column_names
     assert "clean_text" in pages_table.column_names
+    assert "author" in pages_table.column_names
+    assert pages_table.column("work").to_pylist() == ["Seyahatnâme", "Seyahatnâme"]
+    assert pages_table.column("document_date_precision").to_pylist() == ["century", "century"]
     assert "split" in documents_table.column_names
+    assert documents_table.column("author").to_pylist() == ["Evliyâ Çelebi"]
 
     manifest = json.loads(result.manifest.read_text(encoding="utf-8"))
     assert manifest["run_id"] == "run-1234"
diff --git a/tests/test_document_catalog.py b/tests/test_document_catalog.py
new file mode 100644
index 0000000..f7343a4
--- /dev/null
+++ b/tests/test_document_catalog.py
@@ -0,0 +1,48 @@
+import asyncio
+
+from ocr_pipeline.services.db import Database
+
+
+def test_document_metadata_can_be_updated_and_listed(tmp_path):
+    """Upsert a document, update its metadata, and check the row and the listing."""
+    async def _scenario():
+        db = Database(tmp_path / "opencr.sqlite")
+        await db.connect()
+        try:
+            await db.upsert_document(
+                "doc-1",
+                filename="source.pdf",
+                source_path="/tmp/source.pdf",
+                file_sha256="abc",
+                file_size_bytes=123,
+                total_pages=4,
+            )
+
+            # Only catalog metadata is updated; no display_title is supplied.
+            updated = await db.update_document_metadata(
+                "doc-1",
+                author="Evliyâ Çelebi",
+                work="Seyahatnâme",
+                book="1",
+                document_date_label="1900s",
+                document_date_precision="century",
+                language="ota-Latn,tr",
+                script="latin_extended",
+                license="cc-by-4.0",
+            )
+
+            assert updated["author"] == "Evliyâ Çelebi"
+            assert updated["document_date_label"] == "1900s"
+            assert updated["document_date_precision"] == "century"
+
+            # The listing is expected to report the filename as display_title
+            # and metadata_complete as the integer 1.
+            docs = await db.list_documents()
+            assert docs[0]["id"] == "doc-1"
+            assert docs[0]["display_title"] == "source.pdf"
+            assert docs[0]["metadata_complete"] == 1
+        finally:
+            await db.close()
+
+    # The scenario is a coroutine; run it to completion from this sync test.
+    asyncio.run(_scenario())
diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py
index d02f47f..7a24e66 100644
--- a/tests/test_ui_routes.py
+++ b/tests/test_ui_routes.py
@@ -40,6 +40,27 @@ async def fake_wait_for_model_server():
         items = listing.json()
         assert any(item["name"] == "sample.pdf" for item in items)
 
+        documents = client.get("/api/documents")
+        assert documents.status_code == 200
+        docs = documents.json()
+        assert docs[0]["filename"] == "sample.pdf"
+
+        patch = client.patch(
+            f"/api/documents/{docs[0]['id']}",
+            json={
+                "author": "Evliyâ Çelebi",
+                "work": "Seyahatnâme",
+                "book": "1",
+                "document_date_label": "1900s",
+                "document_date_precision": "century",
+                "language": "ota-Latn,tr",
+                "script": "latin_extended",
+                "license": "cc-by-4.0",
+            },
+        )
+        assert patch.status_code == 200
+        assert patch.json()["metadata_complete"] is True
+
 
 def test_runs_list_empty_when_no_runs(tmp_path, monkeypatch):
     output_dir = tmp_path / "output"
@@ -74,3 +95,16 @@ def test_run_detail_uses_minimal_terminal_progress():
     assert 'class="heatmap"' not in html
     assert "currentRunDocument()" in app_js
     assert "runDocumentProgressLabel()" in app_js
+
+
+def test_home_uses_document_workbench():
+    """Static UI files contain the document-workbench markup and handlers."""
+    repo_root = Path(__file__).parents[1]
+    html = (repo_root / "ocr_pipeline/static/index.html").read_text(encoding="utf-8")
+    app_js = (repo_root / "ocr_pipeline/static/js/app.js").read_text(encoding="utf-8")
+
+    assert 'class="document-workbench"' in html
+    assert "document_date_label" in html
+    assert "document_date_precision" in html
+    assert "selectedDocumentIds" in app_js
+    assert "saveSelectedDocument()" in app_js

From 90831844749a56d6370e767fd75b04ea86c661f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 20:59:56 +0300
Subject: [PATCH 03/15] feat: enhance dataset exporter with page-level IDs,
 content hashing, and project metadata inclusion

---
 Makefile                                    |  2 +-
 ocr_pipeline/main.py                        | 29 +++++--
 ocr_pipeline/models/schemas.py              | 39 +++++++--
 ocr_pipeline/routers/documents.py           |  8 +-
 ocr_pipeline/routers/ui.py                  | 15 ++--
 ocr_pipeline/services/dataset_exporter.py   | 87 +++++++++++++++-----
 ocr_pipeline/services/db.py                 | 32 ++++++--
 ocr_pipeline/services/local_ocr_engine.py   | 16 +++-
 ocr_pipeline/services/metadata_collector.py |  9 +-
 ocr_pipeline/services/output_validator.py   | 14 ++--
 ocr_pipeline/services/run_orchestrator.py   | 91 ++++++++++++++-------
 ocr_pipeline/services/script_detector.py    | 17 ++--
 ocr_pipeline/static/css/style.css           | 73 ++++++++++++++---
 ocr_pipeline/static/index.html              | 56 ++++++++-----
 ocr_pipeline/static/js/app.js               | 19 +++++
 requirements-dev.txt                        |  3 +
 scripts/run_batch.py                        | 12 ++-
 tests/test_dataset_exporter.py              | 16 +++-
 tests/test_document_catalog.py              |  2 +
 tests/test_ui_routes.py                     | 14 +++-
 20 files changed, 410 insertions(+), 144 deletions(-)
 create mode 100644 requirements-dev.txt

diff --git a/Makefile b/Makefile
index 7455fb2..e73afcb 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,7 @@ $(VENV):
 
 install: $(VENV)
 	$(PIP) install -r ocr_pipeline/requirements.txt -r requirements-local.txt
-	$(PIP) install pytest pytest-asyncio ruff
+	$(PIP) install -r requirements-dev.txt
 
 run: $(VENV)
 	MODEL_BACKEND=local $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload
diff --git a/ocr_pipeline/main.py b/ocr_pipeline/main.py
index 0c91e17..76177f8 100644
--- a/ocr_pipeline/main.py
+++ b/ocr_pipeline/main.py
@@ -9,7 +9,16 @@
 from starlette.middleware.sessions import SessionMiddleware
 
 from ocr_pipeline.config import settings
-from ocr_pipeline.routers import auth, documents, health, extract, jobs, metrics, runs, ui
+from ocr_pipeline.routers import (
+    auth,
+    documents,
+    health,
+    extract,
+    jobs,
+    metrics,
+    runs,
+    ui,
+)
 from ocr_pipeline.services.db import init_database
 from ocr_pipeline.services.run_orchestrator import init_orchestrator
 from ocr_pipeline.services.run_storage import RunStorage
@@ -25,7 +34,9 @@
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     logger.info("OpenCR v%s starting (cdli.ai)", settings.pipeline_version)
-    logger.info("Model server: %s | Model: %s", settings.model_server_url, settings.model_name)
+    logger.info(
+        "Model server: %s | Model: %s", settings.model_server_url, settings.model_name
+    )
 
     db = init_database(settings.db_path)
     await db.connect()
@@ -39,7 +50,9 @@ async def lifespan(app: FastAPI):
     if await wait_for_model_server():
         logger.info("Pipeline ready to accept requests.")
     else:
-        logger.warning("Model server not ready — extraction requests will 503 until it is available.")
+        logger.warning(
+            "Model server not ready — extraction requests will 503 until it is available."
+        )
 
     yield
 
@@ -55,7 +68,10 @@ async def lifespan(app: FastAPI):
     ),
     version=settings.pipeline_version,
     contact={"name": "cdli.ai", "url": "https://cdli.ai"},
-    license_info={"name": "Apache-2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0"},
+    license_info={
+        "name": "Apache-2.0",
+        "url": "https://www.apache.org/licenses/LICENSE-2.0",
+    },
     lifespan=lifespan,
 )
 
@@ -86,4 +102,7 @@ async def serve_index():
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run("ocr_pipeline.main:app", host=settings.host, port=settings.port, reload=False)
+
+    uvicorn.run(
+        "ocr_pipeline.main:app", host=settings.host, port=settings.port, reload=False
+    )
diff --git a/ocr_pipeline/models/schemas.py b/ocr_pipeline/models/schemas.py
index 575e963..39da55a 100644
--- a/ocr_pipeline/models/schemas.py
+++ b/ocr_pipeline/models/schemas.py
@@ -1,17 +1,25 @@
 from pydantic import BaseModel, Field
-from typing import Any, Optional
+from typing import Optional
 
 
 class ExtractRequest(BaseModel):
     """Single PDF extraction request."""
+
     file_path: str = Field(description="Path to the PDF file")
-    output_dir: Optional[str] = Field(None, description="Output directory override (deprecated)")
-    strip_refs: bool = Field(False, description="Strip model reference blocks from output")
-    export_parquet: bool = Field(False, description="Export trainable Parquet artifacts")
+    output_dir: Optional[str] = Field(
+        None, description="Output directory override (deprecated)"
+    )
+    strip_refs: bool = Field(
+        False, description="Strip model reference blocks from output"
+    )
+    export_parquet: bool = Field(
+        False, description="Export trainable Parquet artifacts"
+    )
 
 
 class ExtractResponse(BaseModel):
     """Single PDF extraction response."""
+
     run_id: str
     document_id: str
     filename: str
@@ -30,10 +38,17 @@ class ExtractResponse(BaseModel):
 
 class JobRequest(BaseModel):
     """Batch extraction job request (compatibility wrapper around runs)."""
+
     file_paths: list[str] = Field(description="List of PDF file paths to process")
-    output_dir: Optional[str] = Field(None, description="Output directory override (deprecated)")
-    strip_refs: bool = Field(False, description="Strip model reference blocks (bounding boxes) from output")
-    export_parquet: bool = Field(False, description="Export a trainable Parquet bundle for the job")
+    output_dir: Optional[str] = Field(
+        None, description="Output directory override (deprecated)"
+    )
+    strip_refs: bool = Field(
+        False, description="Strip model reference blocks (bounding boxes) from output"
+    )
+    export_parquet: bool = Field(
+        False, description="Export a trainable Parquet bundle for the job"
+    )
     name: Optional[str] = Field(None, description="Optional human-friendly name")
 
 
@@ -70,6 +85,7 @@ class FileInfo(BaseModel):
 
 class DocumentUpdate(BaseModel):
     display_title: Optional[str] = None
+    group_path: Optional[str] = None
     author: Optional[str] = None
     work: Optional[str] = None
     book: Optional[str] = None
@@ -87,6 +103,7 @@ class DocumentSummary(BaseModel):
     id: str
     filename: str
     display_title: str
+    group_path: Optional[str] = None
     source_path: str
     file_sha256: str
     file_size_bytes: int
@@ -206,9 +223,13 @@ class RunDocumentDetail(RunDocumentSummary):
 
 
 class HFPublishRequest(BaseModel):
-    repo_id: str = Field(description="HuggingFace dataset repo (e.g. user/my-ocr-dataset)")
+    repo_id: str = Field(
+        description="HuggingFace dataset repo (e.g. user/my-ocr-dataset)"
+    )
     private: bool = False
-    token: Optional[str] = Field(None, description="HF token; if absent, uses HF_TOKEN env")
+    token: Optional[str] = Field(
+        None, description="HF token; if absent, uses HF_TOKEN env"
+    )
     commit_message: Optional[str] = None
 
 
diff --git a/ocr_pipeline/routers/documents.py b/ocr_pipeline/routers/documents.py
index f26c8c5..a94e878 100644
--- a/ocr_pipeline/routers/documents.py
+++ b/ocr_pipeline/routers/documents.py
@@ -12,7 +12,9 @@
 
 def _document_summary(row: dict) -> DocumentSummary:
     data = dict(row)
-    data["display_title"] = data.get("display_title") or data.get("pdf_title") or data["filename"]
+    data["display_title"] = (
+        data.get("display_title") or data.get("pdf_title") or data["filename"]
+    )
     data["metadata_complete"] = bool(data.get("metadata_complete"))
     return DocumentSummary(**data)
 
@@ -27,7 +29,9 @@ async def get_document(document_id: str = ID):
     doc = await get_db().get_document(document_id)
     if not doc:
         raise HTTPException(status_code=404, detail="Document not found")
-    listed = [d for d in await get_db().list_documents(limit=1000) if d["id"] == document_id]
+    listed = [
+        d for d in await get_db().list_documents(limit=1000) if d["id"] == document_id
+    ]
     return _document_summary(listed[0] if listed else doc)
 
 
diff --git a/ocr_pipeline/routers/ui.py b/ocr_pipeline/routers/ui.py
index 2b9c19c..a66da74 100644
--- a/ocr_pipeline/routers/ui.py
+++ b/ocr_pipeline/routers/ui.py
@@ -1,5 +1,6 @@
 """Static-friendly endpoints for input file management. Output/dataset listing
 moved to /api/runs."""
+
 from pathlib import Path
 
 from fastapi import APIRouter, HTTPException, UploadFile
@@ -43,10 +44,12 @@ async def list_input_files():
     for p in sorted(input_dir.iterdir()):
         if p.is_file() and p.suffix.lower() == ".pdf":
             stat = p.stat()
-            files.append(FileInfo(
-                name=p.name,
-                size=stat.st_size,
-                modified=stat.st_mtime,
-                path=str(p),
-            ))
+            files.append(
+                FileInfo(
+                    name=p.name,
+                    size=stat.st_size,
+                    modified=stat.st_mtime,
+                    path=str(p),
+                )
+            )
     return files
diff --git a/ocr_pipeline/services/dataset_exporter.py b/ocr_pipeline/services/dataset_exporter.py
index fe7645e..66db49b 100644
--- a/ocr_pipeline/services/dataset_exporter.py
+++ b/ocr_pipeline/services/dataset_exporter.py
@@ -12,6 +12,14 @@
 from ocr_pipeline.services.run_storage import ArtifactPaths
 
 
+PROJECT_METADATA = {
+    "project": "opencr",
+    "generator": "OpenCR",
+    "organization": "cdli.ai",
+    "organization_url": "https://cdli.ai",
+}
+
+
 @dataclass
 class DatasetExportResult:
     export_id: str
@@ -47,7 +55,9 @@ def _split_pages(text: str, total_pages: int) -> list[str]:
 
     @staticmethod
     def _split_name(stable_key: str) -> str:
-        bucket = int(hashlib.sha256(stable_key.encode("utf-8")).hexdigest()[:8], 16) % 100
+        bucket = (
+            int(hashlib.sha256(stable_key.encode("utf-8")).hexdigest()[:8], 16) % 100
+        )
         if bucket < 90:
             return "train"
         if bucket < 95:
@@ -62,6 +72,10 @@ def _language_list(value) -> list[str]:
             return []
         return [part.strip() for part in str(value).split(",") if part.strip()]
 
+    @staticmethod
+    def _text_sha256(text: str) -> str:
+        return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
     def export_run(
         self,
         run_id: str,
@@ -77,9 +91,21 @@ def export_run(
             doc_meta = entry.metadata
             catalog = entry.catalog_metadata or {}
             paths = entry.artifact_paths
-            raw_text = paths.raw_txt.read_text(encoding="utf-8") if paths.raw_txt.exists() else ""
-            clean_text = paths.clean_txt.read_text(encoding="utf-8") if paths.clean_txt.exists() else ""
-            markdown_text = paths.markdown.read_text(encoding="utf-8") if paths.markdown.exists() else ""
+            raw_text = (
+                paths.raw_txt.read_text(encoding="utf-8")
+                if paths.raw_txt.exists()
+                else ""
+            )
+            clean_text = (
+                paths.clean_txt.read_text(encoding="utf-8")
+                if paths.clean_txt.exists()
+                else ""
+            )
+            markdown_text = (
+                paths.markdown.read_text(encoding="utf-8")
+                if paths.markdown.exists()
+                else ""
+            )
             raw_pages = self._split_pages(raw_text, doc_meta.total_pages)
             clean_pages = self._split_pages(clean_text, doc_meta.total_pages)
             split = self._split_name(doc_meta.file_sha256)
@@ -87,19 +113,27 @@ def export_run(
             for page_meta, page_raw_text, page_clean_text in zip(
                 doc_meta.pages, raw_pages, clean_pages
             ):
+                page_id = f"{entry.document_id}_page_{page_meta.page_num:04d}"
                 page_rows.append(
                     {
                         "dataset_export_id": export_id,
                         "run_id": run_id,
+                        "page_id": page_id,
                         "document_id": entry.document_id,
                         "document_name": doc_meta.filename,
-                        "title": catalog.get("display_title") or catalog.get("title") or doc_meta.pdf_title,
+                        "title": catalog.get("display_title")
+                        or catalog.get("title")
+                        or doc_meta.pdf_title,
+                        "group_path": catalog.get("group_path"),
                         "author": catalog.get("author") or doc_meta.pdf_author,
                         "work": catalog.get("work"),
                         "book": catalog.get("book"),
                         "document_date_label": catalog.get("document_date_label"),
-                        "document_date_precision": catalog.get("document_date_precision"),
-                        "language": self._language_list(catalog.get("language")) or page_meta.detected_languages,
+                        "document_date_precision": catalog.get(
+                            "document_date_precision"
+                        ),
+                        "language": self._language_list(catalog.get("language"))
+                        or page_meta.detected_languages,
                         "script": catalog.get("script") or page_meta.primary_script,
                         "license": catalog.get("license"),
                         "source_citation": catalog.get("source_citation"),
@@ -107,6 +141,8 @@ def export_run(
                         "source_pdf_sha256": doc_meta.file_sha256,
                         "raw_text": page_raw_text,
                         "clean_text": page_clean_text,
+                        "raw_text_sha256": self._text_sha256(page_raw_text),
+                        "clean_text_sha256": self._text_sha256(page_clean_text),
                         "validation_status": page_meta.validation_status,
                         "validation_issues": page_meta.validation_issues,
                         "script_direction": page_meta.script_direction,
@@ -125,26 +161,32 @@ def export_run(
                 )
 
             document_rows.append(
-                    {
+                {
                     "dataset_export_id": export_id,
                     "run_id": run_id,
-                        "document_id": entry.document_id,
-                        "document_name": doc_meta.filename,
-                        "title": catalog.get("display_title") or catalog.get("title") or doc_meta.pdf_title,
-                        "author": catalog.get("author") or doc_meta.pdf_author,
-                        "work": catalog.get("work"),
-                        "book": catalog.get("book"),
-                        "document_date_label": catalog.get("document_date_label"),
-                        "document_date_precision": catalog.get("document_date_precision"),
-                        "language": self._language_list(catalog.get("language")) or doc_meta.languages_detected,
-                        "script": catalog.get("script") or doc_meta.dominant_script,
-                        "license": catalog.get("license"),
-                        "source_citation": catalog.get("source_citation"),
-                        "notes": catalog.get("notes"),
-                        "source_pdf_sha256": doc_meta.file_sha256,
+                    "document_id": entry.document_id,
+                    "document_name": doc_meta.filename,
+                    "title": catalog.get("display_title")
+                    or catalog.get("title")
+                    or doc_meta.pdf_title,
+                    "group_path": catalog.get("group_path"),
+                    "author": catalog.get("author") or doc_meta.pdf_author,
+                    "work": catalog.get("work"),
+                    "book": catalog.get("book"),
+                    "document_date_label": catalog.get("document_date_label"),
+                    "document_date_precision": catalog.get("document_date_precision"),
+                    "language": self._language_list(catalog.get("language"))
+                    or doc_meta.languages_detected,
+                    "script": catalog.get("script") or doc_meta.dominant_script,
+                    "license": catalog.get("license"),
+                    "source_citation": catalog.get("source_citation"),
+                    "notes": catalog.get("notes"),
+                    "source_pdf_sha256": doc_meta.file_sha256,
                     "page_count": doc_meta.total_pages,
                     "raw_text": raw_text,
                     "clean_text": clean_text,
+                    "raw_text_sha256": self._text_sha256(raw_text),
+                    "clean_text_sha256": self._text_sha256(clean_text),
                     "markdown": markdown_text,
                     "pages_pass": doc_meta.pages_pass,
                     "pages_warn": doc_meta.pages_warn,
@@ -171,6 +213,7 @@ def export_run(
         manifest_payload = {
             "export_id": export_id,
             "run_id": run_id,
+            "created_by": PROJECT_METADATA,
             "documents_count": len(document_rows),
             "pages_count": len(page_rows),
             "artifacts": {
diff --git a/ocr_pipeline/services/db.py b/ocr_pipeline/services/db.py
index d88b671..90dccc4 100644
--- a/ocr_pipeline/services/db.py
+++ b/ocr_pipeline/services/db.py
@@ -12,6 +12,7 @@
 
 DOCUMENT_METADATA_FIELDS = {
     "display_title",
+    "group_path",
     "author",
     "work",
     "book",
@@ -27,6 +28,7 @@
 
 DOCUMENT_METADATA_COLUMNS = {
     "display_title": "TEXT",
+    "group_path": "TEXT",
     "author": "TEXT",
     "work": "TEXT",
     "book": "TEXT",
@@ -76,6 +78,7 @@
     pdf_creation_date TEXT,
     pdf_producer TEXT,
     display_title TEXT,
+    group_path TEXT,
     author TEXT,
     work TEXT,
     book TEXT,
@@ -205,7 +208,9 @@ async def _migrate(self) -> None:
             existing = {row["name"] for row in await cur.fetchall()}
         for name, column_type in DOCUMENT_METADATA_COLUMNS.items():
             if name not in existing:
-                await self.conn.execute(f"ALTER TABLE documents ADD COLUMN {name} {column_type}")
+                await self.conn.execute(
+                    f"ALTER TABLE documents ADD COLUMN {name} {column_type}"
+                )
 
     @asynccontextmanager
     async def cursor(self) -> AsyncIterator[aiosqlite.Cursor]:
@@ -263,7 +268,9 @@ async def update_run(self, run_id: str, **fields: Any) -> None:
         await self.conn.commit()
 
     async def get_run(self, run_id: str) -> Optional[dict[str, Any]]:
-        async with self.conn.execute("SELECT * FROM runs WHERE id = ?", (run_id,)) as cur:
+        async with self.conn.execute(
+            "SELECT * FROM runs WHERE id = ?", (run_id,)
+        ) as cur:
             row = await cur.fetchone()
             return _row_to_dict(row)
 
@@ -361,7 +368,7 @@ async def list_documents(self, limit: int = 500) -> list[dict[str, Any]]:
             """
             SELECT d.id, d.filename, d.source_path, d.file_sha256, d.file_size_bytes,
                    d.total_pages, d.pdf_title, d.pdf_author, d.pdf_creation_date, d.pdf_producer,
-                   d.author, d.work, d.book, d.document_date_label, d.document_date_precision,
+                   d.group_path, d.author, d.work, d.book, d.document_date_label, d.document_date_precision,
                    d.language, d.script, d.license, d.source_citation, d.notes, d.tags_json,
                    d.catalog_updated_at, d.first_seen_at, d.last_seen_at,
                    COALESCE(NULLIF(d.display_title, ''), NULLIF(d.pdf_title, ''), d.filename)
@@ -401,7 +408,9 @@ async def list_documents(self, limit: int = 500) -> list[dict[str, Any]]:
             rows = await cur.fetchall()
             return [_row_to_dict(r) for r in rows]  # type: ignore[misc]
 
-    async def update_document_metadata(self, document_id: str, **fields: Any) -> dict[str, Any]:
+    async def update_document_metadata(
+        self, document_id: str, **fields: Any
+    ) -> dict[str, Any]:
         clean = {k: v for k, v in fields.items() if k in DOCUMENT_METADATA_FIELDS}
         if clean:
             clean["catalog_updated_at"] = _now()
@@ -464,7 +473,9 @@ async def link_run_document(
         )
         await self.conn.commit()
 
-    async def update_run_document(self, run_id: str, document_id: str, **fields: Any) -> None:
+    async def update_run_document(
+        self, run_id: str, document_id: str, **fields: Any
+    ) -> None:
         if not fields:
             return
         cols = ", ".join(f"{k} = ?" for k in fields)
@@ -567,11 +578,18 @@ async def list_pages(self, run_id: str, document_id: str) -> list[dict[str, Any]
 
     # ---------- events ----------
 
-    async def append_event(self, run_id: str, event_type: str, payload: dict[str, Any]) -> int:
+    async def append_event(
+        self, run_id: str, event_type: str, payload: dict[str, Any]
+    ) -> int:
         now = _now()
         cur = await self.conn.execute(
             "INSERT INTO run_events (run_id, event_type, payload, created_at) VALUES (?, ?, ?, ?)",
-            (run_id, event_type, json.dumps(payload, ensure_ascii=False, default=str), now),
+            (
+                run_id,
+                event_type,
+                json.dumps(payload, ensure_ascii=False, default=str),
+                now,
+            ),
         )
         await self.conn.commit()
         last_id = cur.lastrowid or 0
diff --git a/ocr_pipeline/services/local_ocr_engine.py b/ocr_pipeline/services/local_ocr_engine.py
index 2305b1b..b9816e9 100644
--- a/ocr_pipeline/services/local_ocr_engine.py
+++ b/ocr_pipeline/services/local_ocr_engine.py
@@ -14,11 +14,13 @@
   imported when this module is instantiated. Install them via
   `requirements-local.txt`.
 """
+
 from __future__ import annotations
 
 import asyncio
 import logging
 import tempfile
+from importlib.util import find_spec
 from pathlib import Path
 from typing import Any
 
@@ -56,6 +58,7 @@ def _resolve_device(requested: str) -> str:
 
 def _resolve_dtype(requested: str, device: str):
     import torch
+
     if requested == "float16":
         return torch.float16
     if requested == "bfloat16":
@@ -105,8 +108,12 @@ async def _ensure_loaded(self) -> None:
             await asyncio.to_thread(self._load_blocking)
 
     def _load_blocking(self) -> None:
+        if find_spec("torch") is None:
+            raise RuntimeError(
+                "MODEL_BACKEND=local requires `transformers` and `torch`. "
+                "Install with: pip install -r requirements-local.txt"
+            )
         try:
-            import torch
             from transformers import AutoModel, AutoTokenizer
         except ImportError as exc:
             raise RuntimeError(
@@ -118,7 +125,9 @@ def _load_blocking(self) -> None:
         dtype = _resolve_dtype(settings.local_dtype, device)
         logger.info(
             "Loading %s on %s (%s). First boot downloads ~6 GB.",
-            self.model_name, device, dtype,
+            self.model_name,
+            device,
+            dtype,
         )
 
         # eager attention works everywhere; flash-attn-2 is CUDA-only and would
@@ -126,7 +135,8 @@ def _load_blocking(self) -> None:
         attn_impl = "flash_attention_2" if device == "cuda" else "eager"
 
         tokenizer = AutoTokenizer.from_pretrained(
-            self.model_name, trust_remote_code=True,
+            self.model_name,
+            trust_remote_code=True,
             cache_dir=str(settings.local_model_cache),
         )
         model = AutoModel.from_pretrained(
diff --git a/ocr_pipeline/services/metadata_collector.py b/ocr_pipeline/services/metadata_collector.py
index 859dd14..ec1231d 100644
--- a/ocr_pipeline/services/metadata_collector.py
+++ b/ocr_pipeline/services/metadata_collector.py
@@ -3,7 +3,7 @@
 import fitz
 import tiktoken
 
-from ocr_pipeline.models.metadata import PageMetadata, DocumentMetadata
+from ocr_pipeline.models.metadata import PageMetadata
 from ocr_pipeline.services.script_detector import ScriptAnalysis
 from ocr_pipeline.services.output_validator import ValidationResult
 from ocr_pipeline.services.page_analyzer import PageProfile
@@ -23,7 +23,7 @@ def count_tokens(self, text: str) -> int:
         """Count tokens using tiktoken cl100k_base encoding."""
         if self._tokenizer:
             return len(self._tokenizer.encode(text))
-        return int(len(text.split()) * 1.3)# Fallback: rough estimate
+        return int(len(text.split()) * 1.3)  # Fallback: rough estimate
 
     def build_page_metadata(
         self,
@@ -38,7 +38,7 @@ def build_page_metadata(
         page_profile: PageProfile,
     ) -> PageMetadata:
         words = text.split()
-        lines = [l for l in text.split("\n") if l.strip()]
+        lines = [line for line in text.split("\n") if line.strip()]
         token_count = self.count_tokens(text)
 
         return PageMetadata(
@@ -64,7 +64,8 @@ def build_page_metadata(
             validation_issues=validation_result.issues,
             repetition_ratio=validation_result.metrics.get("repetition_ratio", 0),
             has_embedded_text=page_profile.has_embedded_text,
-            is_image_only=not page_profile.has_embedded_text and page_profile.has_images,
+            is_image_only=not page_profile.has_embedded_text
+            and page_profile.has_images,
             page_width=page_profile.width,
             page_height=page_profile.height,
             image_count=page_profile.image_count,
diff --git a/ocr_pipeline/services/output_validator.py b/ocr_pipeline/services/output_validator.py
index dd0b1e1..5d25d59 100644
--- a/ocr_pipeline/services/output_validator.py
+++ b/ocr_pipeline/services/output_validator.py
@@ -25,12 +25,12 @@ class OutputValidator:
     """
 
     # --- Thresholds — tune based on your corpus --- #
-    MAX_REPETITION_RATIO = 0.35                      # If >35% of lines are duplicates -> WARN
-    MAX_REPETITION_RATIO_FAIL = 0.60                 # If >60% -> FAIL
-    MIN_UNIQUE_CHARS = 10                            # Minimum unique characters for non-empty
-    MAX_SINGLE_CHAR_RATIO = 0.50                     # If >50% of text is one character -> FAIL
-    MIN_TEXT_LENGTH = 20                             # Below this = probably blank page
-    MAX_CONSECUTIVE_DUPES = 5                        # 5+ identical consecutive lines = WARN
+    MAX_REPETITION_RATIO = 0.35  # If >35% of lines are duplicates -> WARN
+    MAX_REPETITION_RATIO_FAIL = 0.60  # If >60% -> FAIL
+    MIN_UNIQUE_CHARS = 10  # Minimum unique characters for non-empty
+    MAX_SINGLE_CHAR_RATIO = 0.50  # If >50% of text is one character -> FAIL
+    MIN_TEXT_LENGTH = 20  # Below this = probably blank page
+    MAX_CONSECUTIVE_DUPES = 5  # 5+ identical consecutive lines = WARN
 
     def validate(self, text: str, page_num: int) -> ValidationResult:
         issues: list[str] = []
@@ -46,7 +46,7 @@ def validate(self, text: str, page_num: int) -> ValidationResult:
             )
 
         # Check 2: Line-level repetition
-        lines = [l.strip() for l in stripped.split("\n") if l.strip()]
+        lines = [line.strip() for line in stripped.split("\n") if line.strip()]
         single_char_ratio = 0.0
 
         if lines:
diff --git a/ocr_pipeline/services/run_orchestrator.py b/ocr_pipeline/services/run_orchestrator.py
index f3e02cf..1c09f9d 100644
--- a/ocr_pipeline/services/run_orchestrator.py
+++ b/ocr_pipeline/services/run_orchestrator.py
@@ -134,11 +134,15 @@ async def create_run(
             await self.db.link_run_document(run_id, s.document_id, status="pending")
 
         observability.job_created()
-        return CreateRunResult(run_id=run_id, documents=staged, pages_total_estimate=pages_total)
+        return CreateRunResult(
+            run_id=run_id, documents=staged, pages_total_estimate=pages_total
+        )
 
     # ---------- execution ----------
 
-    def start(self, result: CreateRunResult, *, strip_refs: bool, export_parquet: bool) -> asyncio.Task:
+    def start(
+        self, result: CreateRunResult, *, strip_refs: bool, export_parquet: bool
+    ) -> asyncio.Task:
         task = asyncio.create_task(
             self._run(result, strip_refs=strip_refs, export_parquet=export_parquet)
         )
@@ -155,7 +159,9 @@ async def _run(
     ) -> None:
         run_id = result.run_id
         started_at = _now()
-        await self.db.update_run(run_id, status="processing", stage="ocr", started_at=started_at)
+        await self.db.update_run(
+            run_id, status="processing", stage="ocr", started_at=started_at
+        )
         await self._emit(run_id, "run_started", {"started_at": started_at})
 
         pages_total = result.pages_total_estimate
@@ -169,7 +175,9 @@ async def page_event(event: dict) -> None:
                 pages_completed += 1
                 progress = (pages_completed / pages_total) if pages_total else 0
                 await self.db.update_run(
-                    run_id, pages_completed=pages_completed, progress=min(0.99, progress),
+                    run_id,
+                    pages_completed=pages_completed,
+                    progress=min(0.99, progress),
                 )
                 observability.page_completed(
                     processing_time_ms=event.get("processing_time_ms", 0.0),
@@ -182,8 +190,12 @@ async def page_event(event: dict) -> None:
 
         try:
             for staged in result.documents:
-                paths = self.storage.artifact_paths(run_id, staged.document_id, staged.filename)
-                processor = BatchProcessor(self.db, event_callback=page_event, strip_refs=strip_refs)
+                paths = self.storage.artifact_paths(
+                    run_id, staged.document_id, staged.filename
+                )
+                processor = BatchProcessor(
+                    self.db, event_callback=page_event, strip_refs=strip_refs
+                )
                 doc_meta = await processor.process_document(
                     staged.source_path,
                     run_id=run_id,
@@ -193,51 +205,70 @@ async def page_event(event: dict) -> None:
                 )
                 documents_meta.append((staged.document_id, paths, doc_meta))
                 observability.document_completed()
-                await self.db.update_run(run_id, documents_completed=len(documents_meta))
+                await self.db.update_run(
+                    run_id, documents_completed=len(documents_meta)
+                )
 
-            dataset_bundle = await self._maybe_export(run_id, documents_meta, export_parquet)
+            dataset_bundle = await self._maybe_export(
+                run_id, documents_meta, export_parquet
+            )
 
             completed_at = _now()
             await self.db.update_run(
                 run_id,
-                status="completed", stage="completed", progress=1.0,
+                status="completed",
+                stage="completed",
+                progress=1.0,
                 pages_completed=pages_total,
                 dataset_bundle=dataset_bundle,
                 completed_at=completed_at,
             )
             observability.job_completed()
-            await self._emit(run_id, "run_complete", {
-                "completed_at": completed_at,
-                "documents_total": len(result.documents),
-                "documents_completed": len(documents_meta),
-                "pages_total": pages_total,
-                "dataset_bundle": dataset_bundle,
-                **self._aggregate_totals(documents_meta),
-            })
+            await self._emit(
+                run_id,
+                "run_complete",
+                {
+                    "completed_at": completed_at,
+                    "documents_total": len(result.documents),
+                    "documents_completed": len(documents_meta),
+                    "pages_total": pages_total,
+                    "dataset_bundle": dataset_bundle,
+                    **self._aggregate_totals(documents_meta),
+                },
+            )
         except Exception as exc:
             logger.exception("Run %s failed", run_id)
             await self.db.update_run(
-                run_id, status="failed", stage="failed", error=str(exc), completed_at=_now(),
+                run_id,
+                status="failed",
+                stage="failed",
+                error=str(exc),
+                completed_at=_now(),
             )
             observability.job_failed()
             await self._emit(run_id, "run_failed", {"error": str(exc)})
 
-    async def _maybe_export(self, run_id: str, documents_meta: list, export_parquet: bool) -> str | None:
+    async def _maybe_export(
+        self, run_id: str, documents_meta: list, export_parquet: bool
+    ) -> str | None:
         if not (export_parquet and documents_meta):
             return None
         await self.db.update_run(run_id, stage="exporting")
         await self._emit(run_id, "dataset_export_started", {})
         exports = []
         for did, paths, meta in documents_meta:
-            exports.append(DocumentExport(
-                metadata=meta,
-                document_id=did,
-                artifact_paths=paths,
-                catalog_metadata=await self.db.get_document(did) or {},
-            ))
+            exports.append(
+                DocumentExport(
+                    metadata=meta,
+                    document_id=did,
+                    artifact_paths=paths,
+                    catalog_metadata=await self.db.get_document(did) or {},
+                )
+            )
         result = await asyncio.to_thread(
             DatasetExporter(self.storage.dataset_dir(run_id)).export_run,
-            run_id, exports,
+            run_id,
+            exports,
         )
         return str(result.bundle)
 
@@ -248,7 +279,9 @@ def _aggregate_totals(documents_meta) -> dict:
             "pages_warn": sum(m.pages_warn for _, _, m in documents_meta),
             "pages_fail": sum(m.pages_fail for _, _, m in documents_meta),
             "pages_empty": sum(m.pages_empty for _, _, m in documents_meta),
-            "total_time_ms": round(sum(m.total_processing_time_ms for _, _, m in documents_meta), 1),
+            "total_time_ms": round(
+                sum(m.total_processing_time_ms for _, _, m in documents_meta), 1
+            ),
         }
 
     # ---------- events ----------
@@ -268,7 +301,9 @@ async def _emit(self, run_id: str, event_type: str, payload: dict) -> None:
             except asyncio.QueueFull:
                 logger.warning("Listener queue full for run %s; dropping event", run_id)
 
-    async def subscribe(self, run_id: str, after_event_id: int = 0) -> AsyncIterator[dict]:
+    async def subscribe(
+        self, run_id: str, after_event_id: int = 0
+    ) -> AsyncIterator[dict]:
         for ev in await self.db.list_events(run_id, after_id=after_event_id):
             yield {**ev["payload"], "event_id": ev["id"]}
 
diff --git a/ocr_pipeline/services/script_detector.py b/ocr_pipeline/services/script_detector.py
index 051fd3c..633c8fa 100644
--- a/ocr_pipeline/services/script_detector.py
+++ b/ocr_pipeline/services/script_detector.py
@@ -1,12 +1,11 @@
 import unicodedata
 from dataclasses import dataclass
 from enum import Enum
-from collections import Counter
 
 
 class ScriptDirection(str, Enum):
-    LTR = "ltr"      # Latin, Cyrillic, etc.
-    RTL = "rtl"      # Arabic, Hebrew, etc.
+    LTR = "ltr"  # Latin, Cyrillic, etc.
+    RTL = "rtl"  # Arabic, Hebrew, etc.
     MIXED = "mixed"  # Both present significantly
     UNDETERMINED = "undetermined"
 
@@ -23,15 +22,15 @@ class ScriptFamily(str, Enum):
 class ScriptAnalysis:
     direction: ScriptDirection
     primary_script: ScriptFamily
-    ltr_ratio: float                # 0.0 to 1.0
-    rtl_ratio: float                # 0.0 to 1.0
+    ltr_ratio: float  # 0.0 to 1.0
+    rtl_ratio: float  # 0.0 to 1.0
     arabic_char_count: int
     latin_char_count: int
-    extended_latin_count: int       # Characters with diacritics beyond basic ASCII
+    extended_latin_count: int  # Characters with diacritics beyond basic ASCII
     has_diacritics: bool
-    sample_rtl_chars: str           # First few RTL characters found
-    sample_ltr_chars: str           # First few LTR characters found
-    detected_languages: list[str]   # Best-guess language hints
+    sample_rtl_chars: str  # First few RTL characters found
+    sample_ltr_chars: str  # First few LTR characters found
+    detected_languages: list[str]  # Best-guess language hints
 
 
 class ScriptDetector:
diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css
index eb90fa6..58d630f 100644
--- a/ocr_pipeline/static/css/style.css
+++ b/ocr_pipeline/static/css/style.css
@@ -68,7 +68,6 @@ a { color: var(--accent); }
 
 .title-row { display: flex; align-items: baseline; gap: 10px; }
 .title-row h1 { margin: 0; font-size: 1.6rem; line-height: 1; }
-.version { font-size: 0.85rem; color: var(--muted); }
 
 .topbar-meta { display: flex; align-items: center; gap: 16px; }
 
@@ -256,13 +255,16 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 .file-size { color: var(--muted); font-size: 0.85rem; white-space: nowrap; }
 
 .intake-options { display: grid; gap: 12px; grid-template-columns: 1fr; }
-.field { display: flex; flex-direction: column; gap: 6px; }
-.field span { font-size: 0.84rem; color: var(--muted); font-weight: 600; }
+.field { display: flex; flex-direction: column; gap: 5px; min-width: 0; }
+.field span { font-size: 0.76rem; color: var(--muted); font-weight: 700; }
 .field input[type="text"], .field input[type="password"] {
-  padding: 10px 14px;
+  min-width: 0;
+  height: 36px;
+  padding: 8px 11px;
   border: 1px solid var(--border-strong);
-  border-radius: var(--radius-sm);
+  border-radius: 8px;
   font: inherit;
+  font-size: 0.88rem;
   background: var(--surface-strong);
 }
 .field input:focus { outline: 2px solid var(--accent); outline-offset: 1px; border-color: transparent; }
@@ -288,7 +290,7 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 
 .document-workbench-body {
   display: grid;
-  grid-template-columns: minmax(0, 1fr) 340px;
+  grid-template-columns: minmax(0, 1fr) 390px;
   gap: 14px;
   padding: 16px 24px 24px;
 }
@@ -312,6 +314,21 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 .document-toolbar.drag-over { background: var(--accent-soft); }
 
 .document-table { display: grid; }
+.document-group { display: grid; }
+.document-group-head {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  min-height: 28px;
+  padding: 7px 12px;
+  border-bottom: 1px solid var(--border);
+  background: rgba(31, 109, 85, 0.07);
+  color: var(--muted);
+  font-size: 0.72rem;
+  font-weight: 700;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+}
 .document-row {
   display: grid;
   grid-template-columns: 24px minmax(220px, 1fr) 86px 84px 78px;
@@ -341,7 +358,7 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 .document-title span { color: var(--muted); font-size: 0.78rem; }
 
 .document-editor-head {
-  padding: 14px 16px;
+  padding: 12px 14px;
   border-bottom: 1px solid var(--border);
 }
 .document-editor-head .eyebrow {
@@ -352,13 +369,25 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
   font-size: 0.66rem;
   font-weight: 700;
 }
-.document-editor-head h3 { margin: 0; font-size: 1rem; }
-.document-editor-fields { display: grid; gap: 10px; padding: 14px 16px; }
-.field-row { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; }
+.document-editor-head h3 {
+  margin: 0;
+  font-size: 0.96rem;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+.document-editor-fields {
+  display: grid;
+  grid-template-columns: repeat(2, minmax(0, 1fr));
+  gap: 10px 12px;
+  padding: 12px 14px;
+}
+.document-editor-fields .field-wide { grid-column: 1 / -1; }
+.field-row { display: contents; }
 .document-editor-actions {
   display: flex;
   justify-content: flex-end;
-  padding: 12px 16px;
+  padding: 10px 14px;
   border-top: 1px solid var(--border);
 }
 
@@ -554,9 +583,29 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
   min-height: 200px;
   max-height: 380px;
 }
+.inspector-text-head {
+  position: sticky;
+  top: 0;
+  z-index: 1;
+  display: flex;
+  justify-content: space-between;
+  gap: 12px;
+  padding: 8px 14px;
+  border-bottom: 1px solid var(--border);
+  background: #fdfaf3;
+  color: var(--muted);
+  font-size: 0.76rem;
+  font-weight: 700;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+}
+.inspector-text-head code {
+  font-family: var(--font-mono);
+  color: var(--accent);
+}
 .inspector-text pre {
   margin: 0;
-  padding: 16px 20px;
+  padding: 14px 16px;
   white-space: pre-wrap;
   word-break: break-word;
   font: 0.86rem/1.55 var(--font-mono);
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index 242915b..9c53778 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -16,10 +16,9 @@
 
 <header class="topbar">
   <div class="brand">
-    <p class="eyebrow">OCR Operations · by cdli.ai</p>
+    <p class="eyebrow">cdli.ai</p>
     <div class="title-row">
       <h1>OpenCR</h1>
-      <span class="version" x-text="version ? 'v' + version : ''"></span>
     </div>
   </div>
   <div class="topbar-meta">
@@ -130,23 +129,31 @@ <h2>Documents</h2>
               <span>Metadata</span>
               <span>Last run</span>
             </div>
-            <template x-for="doc in documents" :key="doc.id">
-              <div class="document-row"
-                   :class="{ active: selectedDocumentId === doc.id }"
-                   @click="selectDocument(doc.id)">
-                <input type="checkbox"
-                       :checked="selectedDocumentIds.includes(doc.id)"
-                       @click.stop
-                       @change="toggleDocument(doc.id)">
-                <div class="document-title">
-                  <strong x-text="doc.display_title || doc.filename"></strong>
-                  <span x-text="doc.filename"></span>
+            <template x-for="group in groupedDocuments()" :key="group.name">
+              <div class="document-group">
+                <div class="document-group-head" x-show="group.name !== 'Ungrouped'">
+                  <span x-text="group.name"></span>
+                  <span x-text="group.items.length"></span>
                 </div>
-                <span x-text="doc.document_date_label || '—'"></span>
-                <span class="pill pill-sm"
-                      :class="doc.metadata_complete ? 'pill-success' : 'pill-warn'"
-                      x-text="doc.metadata_complete ? 'ready' : 'missing'"></span>
-                <span x-text="doc.latest_run_status || 'never'"></span>
+                <template x-for="doc in group.items" :key="doc.id">
+                  <div class="document-row"
+                       :class="{ active: selectedDocumentId === doc.id }"
+                       @click="selectDocument(doc.id)">
+                    <input type="checkbox"
+                           :checked="selectedDocumentIds.includes(doc.id)"
+                           @click.stop
+                           @change="toggleDocument(doc.id)">
+                    <div class="document-title">
+                      <strong x-text="doc.display_title || doc.filename"></strong>
+                      <span x-text="doc.filename"></span>
+                    </div>
+                    <span x-text="doc.document_date_label || '—'"></span>
+                    <span class="pill pill-sm"
+                          :class="doc.metadata_complete ? 'pill-success' : 'pill-warn'"
+                          x-text="doc.metadata_complete ? 'ready' : 'missing'"></span>
+                    <span x-text="doc.latest_run_status || 'never'"></span>
+                  </div>
+                </template>
               </div>
             </template>
           </div>
@@ -159,7 +166,8 @@ <h2>Documents</h2>
             <h3 x-text="documentDraft.display_title || documentDraft.filename"></h3>
           </header>
           <div class="document-editor-fields">
-            <label class="field"><span>Title</span><input type="text" x-model="documentDraft.display_title"></label>
+            <label class="field field-wide"><span>Title</span><input type="text" x-model="documentDraft.display_title"></label>
+            <label class="field"><span>Group</span><input type="text" placeholder="Collection/Folder" x-model="documentDraft.group_path"></label>
             <label class="field"><span>Author</span><input type="text" x-model="documentDraft.author"></label>
             <label class="field"><span>Work</span><input type="text" x-model="documentDraft.work"></label>
             <label class="field"><span>Book</span><input type="text" x-model="documentDraft.book"></label>
@@ -172,8 +180,8 @@ <h3 x-text="documentDraft.display_title || documentDraft.filename"></h3>
               <label class="field"><span>Script</span><input type="text" placeholder="latin_extended" x-model="documentDraft.script"></label>
             </div>
             <label class="field"><span>License</span><input type="text" placeholder="cc-by-4.0" x-model="documentDraft.license"></label>
-            <label class="field"><span>Source / citation</span><input type="text" x-model="documentDraft.source_citation"></label>
-            <label class="field"><span>Notes</span><input type="text" x-model="documentDraft.notes"></label>
+            <label class="field field-wide"><span>Source / citation</span><input type="text" x-model="documentDraft.source_citation"></label>
+            <label class="field field-wide"><span>Notes</span><input type="text" x-model="documentDraft.notes"></label>
           </div>
           <footer class="document-editor-actions">
             <button class="btn btn-primary" :disabled="savingDocument" @click="saveSelectedDocument">
@@ -307,7 +315,11 @@ <h3 x-text="inspector.document?.filename || ''"></h3>
     </div>
 
     <div class="inspector-text">
-      <pre x-text="inspector.text || '(empty)'"></pre>
+      <div class="inspector-text-head">
+        <span>OCR snapshot</span>
+        <code x-text="`page ${inspector.pageNum}`"></code>
+      </div>
+      <pre x-text="selectedPageText() || '(empty)'"></pre>
     </div>
 
     <div class="inspector-footer">
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index df122f9..90dc9c4 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -199,6 +199,14 @@ function opencrApp() {
       return API.pageImageUrl(this.selectedRunId, this.inspector.documentId, pageNum);
     },
 
+    selectedPageText() { // Text to display in the inspector for the currently selected page.
+      const text = this.inspector.text || '';
+      if (!text) return '';
+      const parts = text.split('\n\f\n'); // NOTE(review): assumes pages are joined by "\n\f\n"; confirm against the server-side page break
+      if (parts.length <= 1) return text; // no separator present: show the whole text
+      return parts[this.inspector.pageNum - 1] || ''; // pageNum is used as a 1-based index; a missing page yields ''
+    },
+
     pageStatusFor(pageNum) {
       return (this.inspector.document?.pages || []).find(p => p.page_num === pageNum)?.status || 'pending';
     },
@@ -246,6 +254,16 @@ function opencrApp() {
       return this.documents.find(d => d.id === this.selectedDocumentId) || null;
     },
 
+    groupedDocuments() { // Buckets this.documents by group_path into [{ name, items }].
+      const groups = new Map(); // a Map keeps groups in first-seen order
+      for (const doc of this.documents) {
+        const name = (doc.group_path || '').trim() || 'Ungrouped'; // blank or missing group_path falls into 'Ungrouped'
+        if (!groups.has(name)) groups.set(name, []);
+        groups.get(name).push(doc);
+      }
+      return [...groups.entries()].map(([name, items]) => ({ name, items }));
+    },
+
     selectDocument(documentId) {
       this.selectedDocumentId = documentId;
       const doc = this.selectedDocument();
@@ -278,6 +296,7 @@ function opencrApp() {
       try {
         await API.updateDocument(this.selectedDocumentId, {
           display_title: this.documentDraft.display_title || null,
+          group_path: this.documentDraft.group_path || null,
           author: this.documentDraft.author || null,
           work: this.documentDraft.work || null,
           book: this.documentDraft.book || null,
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..20bf9e2
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,3 @@
+pytest>=8.0.0
+pytest-asyncio>=0.23.0
+ruff>=0.8.0
diff --git a/scripts/run_batch.py b/scripts/run_batch.py
index 4139a1a..2119d8e 100644
--- a/scripts/run_batch.py
+++ b/scripts/run_batch.py
@@ -3,7 +3,6 @@
 
 import argparse
 import asyncio
-import json
 import sys
 import time
 from pathlib import Path
@@ -22,7 +21,9 @@ async def progress_callback(event: dict):
         doc = event["document"]
         page = event["page"]
         total = event["total_pages"]
-        print(f"  [{page}/{total}] Processing {doc} page {page} ({event['mode']}, {event['dpi']} DPI)...")
+        print(
+            f"  [{page}/{total}] Processing {doc} page {page} ({event['mode']}, {event['dpi']} DPI)..."
+        )
 
     elif event_type == "page_complete":
         doc = event["document"]
@@ -51,7 +52,9 @@ async def progress_callback(event: dict):
         p = event["pages_pass"]
         w = event["pages_warn"]
         f = event["pages_fail"]
-        print(f"\n  {doc}: {pages} pages | pass={p} warn={w} fail={f} | {time_ms:.0f}ms")
+        print(
+            f"\n  {doc}: {pages} pages | pass={p} warn={w} fail={f} | {time_ms:.0f}ms"
+        )
         print(f"  Output: {event['output_path']}")
 
 
@@ -66,7 +69,8 @@ async def main():
         help="PDF file(s) or directory containing PDFs",
     )
     parser.add_argument(
-        "-o", "--output",
+        "-o",
+        "--output",
         default=None,
         help=f"Output directory (default: {settings.output_dir})",
     )
diff --git a/tests/test_dataset_exporter.py b/tests/test_dataset_exporter.py
index f542725..20493cc 100644
--- a/tests/test_dataset_exporter.py
+++ b/tests/test_dataset_exporter.py
@@ -32,6 +32,7 @@ def test_export_run_writes_parquet_manifest_and_bundle(tmp_path):
         artifact_paths=paths,
         catalog_metadata={
             "author": "Evliyâ Çelebi",
+            "group_path": "Ottoman/Seyahatname",
             "work": "Seyahatnâme",
             "book": "1",
             "document_date_label": "1900s",
@@ -52,16 +53,29 @@ def test_export_run_writes_parquet_manifest_and_bundle(tmp_path):
     documents_table = pq.read_table(result.documents_parquet)
     assert pages_table.num_rows == 2
     assert documents_table.num_rows == 1
+    assert "page_id" in pages_table.column_names
     assert "raw_text" in pages_table.column_names
     assert "clean_text" in pages_table.column_names
+    assert "clean_text_sha256" in pages_table.column_names
     assert "author" in pages_table.column_names
+    assert pages_table.column("group_path").to_pylist() == [
+        "Ottoman/Seyahatname",
+        "Ottoman/Seyahatname",
+    ]
     assert pages_table.column("work").to_pylist() == ["Seyahatnâme", "Seyahatnâme"]
-    assert pages_table.column("document_date_precision").to_pylist() == ["century", "century"]
+    assert pages_table.column("document_date_precision").to_pylist() == [
+        "century",
+        "century",
+    ]
     assert "split" in documents_table.column_names
+    assert "clean_text_sha256" in documents_table.column_names
     assert documents_table.column("author").to_pylist() == ["Evliyâ Çelebi"]
+    assert documents_table.column("group_path").to_pylist() == ["Ottoman/Seyahatname"]
 
     manifest = json.loads(result.manifest.read_text(encoding="utf-8"))
     assert manifest["run_id"] == "run-1234"
+    assert manifest["created_by"]["project"] == "opencr"
+    assert manifest["created_by"]["organization"] == "cdli.ai"
     assert manifest["documents_count"] == 1
     assert manifest["pages_count"] == 2
     assert manifest["schema_version"] == 2
diff --git a/tests/test_document_catalog.py b/tests/test_document_catalog.py
index f7343a4..1b75bc6 100644
--- a/tests/test_document_catalog.py
+++ b/tests/test_document_catalog.py
@@ -19,6 +19,7 @@ async def _scenario():
 
             updated = await db.update_document_metadata(
                 "doc-1",
+                group_path="Ottoman/Seyahatname",
                 author="Evliyâ Çelebi",
                 work="Seyahatnâme",
                 book="1",
@@ -30,6 +31,7 @@ async def _scenario():
             )
 
             assert updated["author"] == "Evliyâ Çelebi"
+            assert updated["group_path"] == "Ottoman/Seyahatname"
             assert updated["document_date_label"] == "1900s"
             assert updated["document_date_precision"] == "century"
 
diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py
index 7a24e66..f0d8355 100644
--- a/tests/test_ui_routes.py
+++ b/tests/test_ui_routes.py
@@ -23,7 +23,9 @@ async def fake_wait_for_model_server():
     monkeypatch.setattr(settings, "output_dir", output_dir)
     monkeypatch.setattr(settings, "runs_dir", runs_dir)
     monkeypatch.setattr(settings, "db_path", db_path)
-    monkeypatch.setattr(main_module, "wait_for_model_server", fake_wait_for_model_server)
+    monkeypatch.setattr(
+        main_module, "wait_for_model_server", fake_wait_for_model_server
+    )
     model_readiness.ready = True
     model_readiness.error = None
 
@@ -49,6 +51,7 @@ async def fake_wait_for_model_server():
             f"/api/documents/{docs[0]['id']}",
             json={
                 "author": "Evliyâ Çelebi",
+                "group_path": "Ottoman/Seyahatname",
                 "work": "Seyahatnâme",
                 "book": "1",
                 "document_date_label": "1900s",
@@ -60,6 +63,7 @@ async def fake_wait_for_model_server():
         )
         assert patch.status_code == 200
         assert patch.json()["metadata_complete"] is True
+        assert patch.json()["group_path"] == "Ottoman/Seyahatname"
 
 
 def test_runs_list_empty_when_no_runs(tmp_path, monkeypatch):
@@ -74,7 +78,9 @@ async def fake_wait_for_model_server():
     monkeypatch.setattr(settings, "output_dir", output_dir)
     monkeypatch.setattr(settings, "runs_dir", output_dir / "runs")
     monkeypatch.setattr(settings, "db_path", output_dir / "opencr.sqlite")
-    monkeypatch.setattr(main_module, "wait_for_model_server", fake_wait_for_model_server)
+    monkeypatch.setattr(
+        main_module, "wait_for_model_server", fake_wait_for_model_server
+    )
     model_readiness.ready = True
 
     with TestClient(main_module.app) as client:
@@ -105,5 +111,9 @@ def test_home_uses_document_workbench():
     assert 'class="document-workbench"' in html
     assert "document_date_label" in html
     assert "document_date_precision" in html
+    assert "group_path" in html
+    assert "OCR snapshot" in html
     assert "selectedDocumentIds" in app_js
+    assert "groupedDocuments()" in app_js
+    assert "selectedPageText()" in app_js
     assert "saveSelectedDocument()" in app_js

From df9555e1b76e08f987d519857b8e93f2fca61e69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 21:32:45 +0300
Subject: [PATCH 04/15] feat: add bulk document update endpoint and OCR pair
 dataset exporter service

---
 ocr_pipeline/models/schemas.py             |   5 +
 ocr_pipeline/routers/documents.py          |  27 ++-
 ocr_pipeline/routers/runs.py               | 126 +++++++++---
 ocr_pipeline/services/ocr_pair_exporter.py | 212 +++++++++++++++++++++
 ocr_pipeline/static/css/style.css          |  19 +-
 ocr_pipeline/static/index.html             |  22 +++
 ocr_pipeline/static/js/api.js              |  17 ++
 ocr_pipeline/static/js/app.js              |  62 +++++-
 tests/test_ocr_pair_exporter.py            | 117 ++++++++++++
 tests/test_ui_routes.py                    |  54 ++++++
 10 files changed, 634 insertions(+), 27 deletions(-)
 create mode 100644 ocr_pipeline/services/ocr_pair_exporter.py
 create mode 100644 tests/test_ocr_pair_exporter.py

diff --git a/ocr_pipeline/models/schemas.py b/ocr_pipeline/models/schemas.py
index 39da55a..bdf0e79 100644
--- a/ocr_pipeline/models/schemas.py
+++ b/ocr_pipeline/models/schemas.py
@@ -99,6 +99,11 @@ class DocumentUpdate(BaseModel):
     tags_json: Optional[str] = None
 
 
+class BulkDocumentUpdate(BaseModel):  # Request body for PATCH /api/documents/bulk
+    document_ids: list[str]  # documents to update; the route answers 400 when empty
+    group_path: Optional[str] = None  # passed as group_path for every listed document
+
+
 class DocumentSummary(BaseModel):
     id: str
     filename: str
diff --git a/ocr_pipeline/routers/documents.py b/ocr_pipeline/routers/documents.py
index a94e878..b301e17 100644
--- a/ocr_pipeline/routers/documents.py
+++ b/ocr_pipeline/routers/documents.py
@@ -1,6 +1,11 @@
 from fastapi import APIRouter, HTTPException, Path as PathParam, Query
 
-from ocr_pipeline.models.schemas import DocumentSummary, DocumentUpdate, RunSummary
+from ocr_pipeline.models.schemas import (
+    BulkDocumentUpdate,
+    DocumentSummary,
+    DocumentUpdate,
+    RunSummary,
+)
 from ocr_pipeline.routers.runs import _run_summary
 from ocr_pipeline.services.db import get_db
 
@@ -24,6 +29,26 @@ async def list_documents(limit: int = Query(500, ge=1, le=1000)):
     return [_document_summary(d) for d in await get_db().list_documents(limit=limit)]
 
 
+@router.patch("/api/documents/bulk", response_model=list[DocumentSummary])
+async def update_documents_bulk(payload: BulkDocumentUpdate):  # apply one group_path to many documents
+    if not payload.document_ids:
+        raise HTTPException(status_code=400, detail="document_ids must not be empty")
+    db = get_db()
+    for document_id in payload.document_ids:  # NOTE(review): earlier ids stay updated if a later one 404s
+        try:
+            await db.update_document_metadata(
+                document_id,
+                group_path=payload.group_path,
+            )
+        except KeyError as exc:
+            raise HTTPException(
+                status_code=404, detail=f"Document not found: {document_id}"
+            ) from exc  # keep the original KeyError as the cause
+    documents = await db.list_documents(limit=1000)  # NOTE(review): rows past this limit are missing from the response
+    selected = set(payload.document_ids)
+    return [_document_summary(doc) for doc in documents if doc["id"] in selected]
+
+
 @router.get("/api/documents/{document_id}", response_model=DocumentSummary)
 async def get_document(document_id: str = ID):
     doc = await get_db().get_document(document_id)
diff --git a/ocr_pipeline/routers/runs.py b/ocr_pipeline/routers/runs.py
index 5f3e4df..d6b2e1e 100644
--- a/ocr_pipeline/routers/runs.py
+++ b/ocr_pipeline/routers/runs.py
@@ -4,17 +4,34 @@
 from pathlib import Path
 
 from fastapi import APIRouter, HTTPException, Path as PathParam, Query, Request
-from fastapi.responses import FileResponse, PlainTextResponse, Response, StreamingResponse
+from fastapi.responses import (
+    FileResponse,
+    PlainTextResponse,
+    Response,
+    StreamingResponse,
+)
 
 from ocr_pipeline.config import settings
 from ocr_pipeline.models.schemas import (
-    HFPublishRequest, HFPublishResponse, PageSummary, RunCreateRequest,
-    RunCreateResponse, RunDetail, RunDocumentDetail, RunDocumentSummary,
-    RunSummary, StagedDocumentInfo,
+    HFPublishRequest,
+    HFPublishResponse,
+    PageSummary,
+    RunCreateRequest,
+    RunCreateResponse,
+    RunDetail,
+    RunDocumentDetail,
+    RunDocumentSummary,
+    RunSummary,
+    StagedDocumentInfo,
+)
+from ocr_pipeline.services.auth_session import (
+    is_oauth_enabled,
+    session_token,
+    session_user,
 )
-from ocr_pipeline.services.auth_session import is_oauth_enabled, session_token, session_user
 from ocr_pipeline.services.db import get_db
 from ocr_pipeline.services.hf_publisher import publish_run_to_hf
+from ocr_pipeline.services.ocr_pair_exporter import OCRPairExporter
 from ocr_pipeline.services.pdf_renderer import PDFRenderer
 from ocr_pipeline.services.run_orchestrator import get_orchestrator
 from ocr_pipeline.services.startup import model_readiness
@@ -33,9 +50,12 @@
     "source": ("artifact_source_pdf", "application/pdf"),
 }
 TEXT_MODES = {
-    "raw": "artifact_raw_txt", "raw_txt": "artifact_raw_txt",
-    "txt": "artifact_clean_txt", "clean": "artifact_clean_txt",
-    "md": "artifact_markdown", "markdown": "artifact_markdown",
+    "raw": "artifact_raw_txt",
+    "raw_txt": "artifact_raw_txt",
+    "txt": "artifact_clean_txt",
+    "clean": "artifact_clean_txt",
+    "md": "artifact_markdown",
+    "markdown": "artifact_markdown",
 }
 
 
@@ -103,6 +123,7 @@ def _doc_summary(row: dict) -> RunDocumentSummary:
 def _page_summary(row: dict) -> PageSummary:
     def _bool(v):
         return bool(v) if v is not None else None
+
     return PageSummary(
         page_num=row["page_num"],
         status=row["status"],
@@ -149,7 +170,9 @@ def _existing_path(rd: dict, field: str) -> Path:
 @router.post("/api/runs", response_model=RunCreateResponse)
 async def create_run(request: RunCreateRequest):
     if not model_readiness.ready:
-        raise HTTPException(status_code=503, detail=f"Model server not ready: {model_readiness.status}")
+        raise HTTPException(
+            status_code=503, detail=f"Model server not ready: {model_readiness.status}"
+        )
     if not request.file_paths:
         raise HTTPException(status_code=400, detail="file_paths must not be empty")
 
@@ -164,7 +187,9 @@ async def create_run(request: RunCreateRequest):
     except FileNotFoundError as exc:
         raise HTTPException(status_code=404, detail=str(exc))
 
-    orchestrator.start(result, strip_refs=request.strip_refs, export_parquet=request.export_parquet)
+    orchestrator.start(
+        result, strip_refs=request.strip_refs, export_parquet=request.export_parquet
+    )
 
     return RunCreateResponse(
         run_id=result.run_id,
@@ -193,29 +218,37 @@ async def list_runs(limit: int = Query(50, ge=1, le=500)):
 async def get_run(run_id: str = ID):
     run = await _require_run(run_id)
     documents = await get_db().list_run_documents(run_id)
-    return RunDetail(**_run_summary(run).model_dump(),
-                     documents=[_doc_summary(d) for d in documents])
+    return RunDetail(
+        **_run_summary(run).model_dump(), documents=[_doc_summary(d) for d in documents]
+    )
 
 
 @router.delete("/api/runs/{run_id}")
 async def delete_run(run_id: str = ID):
     run = await _require_run(run_id)
     if run["status"] == "processing":
-        raise HTTPException(status_code=409, detail="Cannot delete a run that is still processing")
+        raise HTTPException(
+            status_code=409, detail="Cannot delete a run that is still processing"
+        )
     await get_db().delete_run(run_id)
     return {"deleted": run_id}
 
 
-@router.get("/api/runs/{run_id}/documents/{document_id}", response_model=RunDocumentDetail)
+@router.get(
+    "/api/runs/{run_id}/documents/{document_id}", response_model=RunDocumentDetail
+)
 async def get_run_document(run_id: str = ID, document_id: str = ID):
     rd = await _require_doc(run_id, document_id)
     pages = await get_db().list_pages(run_id, document_id)
-    return RunDocumentDetail(**_doc_summary(rd).model_dump(),
-                             pages=[_page_summary(p) for p in pages])
+    return RunDocumentDetail(
+        **_doc_summary(rd).model_dump(), pages=[_page_summary(p) for p in pages]
+    )
 
 
 @router.get("/api/runs/{run_id}/documents/{document_id}/text")
-async def get_run_document_text(run_id: str = ID, document_id: str = ID, mode: str = "txt"):
+async def get_run_document_text(
+    run_id: str = ID, document_id: str = ID, mode: str = "txt"
+):
     field = TEXT_MODES.get(mode)
     if not field:
         raise HTTPException(status_code=400, detail="Unsupported mode")
@@ -267,7 +300,48 @@ async def download_dataset_bundle(run_id: str = ID):
     bundle = run.get("dataset_bundle")
     if not bundle or not Path(bundle).exists():
         raise HTTPException(status_code=404, detail="Dataset bundle not available")
-    return FileResponse(bundle, media_type="application/zip", filename=Path(bundle).name)
+    return FileResponse(
+        bundle, media_type="application/zip", filename=Path(bundle).name
+    )
+
+
+@router.get("/api/runs/{run_id}/ocr-pairs/download")
+async def download_ocr_pairs(  # Export a completed run through OCRPairExporter and return the zip.
+    run_id: str = ID,
+    dpi: int = Query(160, ge=50, le=400),  # forwarded to the exporter; accepted range 50-400
+    text_mode: str = Query("clean", pattern="^(clean|raw)$"),  # "clean" or "raw"
+):
+    db = get_db()
+    run = await _require_run(run_id)
+    if run["status"] != "completed":  # 409 for any other run status
+        raise HTTPException(status_code=409, detail="Run is not yet completed")
+
+    documents = await db.list_run_documents(run_id)
+    pages_by_document = {  # document_id -> rows from db.list_pages, awaited one document at a time
+        doc["document_id"]: await db.list_pages(run_id, doc["document_id"])
+        for doc in documents
+    }
+    catalog_by_document = {  # document_id -> catalog row, or {} when get_document returns a falsy value
+        doc["document_id"]: await db.get_document(doc["document_id"]) or {}
+        for doc in documents
+    }
+    exporter = OCRPairExporter(settings.runs_dir / run_id / "dataset" / "ocr_pairs")
+    result = await asyncio.to_thread(  # run the export in a worker thread, off the event loop
+        exporter.export_run,
+        run=run,
+        documents=documents,
+        pages_by_document=pages_by_document,
+        catalog_by_document=catalog_by_document,
+        dpi=dpi,
+        text_mode=text_mode,
+    )
+    if result.pages_count == 0:  # checked after the export has already run
+        raise HTTPException(status_code=404, detail="No completed OCR pages to export")
+    return FileResponse(
+        result.bundle,
+        media_type="application/zip",
+        filename=f"{run_id}-ocr-pairs.zip",
+    )
 
 
 @router.get("/api/runs/{run_id}/stream")
@@ -276,14 +350,18 @@ async def stream_run(run_id: str = ID, after_event_id: int = 0):
     orchestrator = get_orchestrator()
 
     async def gen():
-        async for event in orchestrator.subscribe(run_id, after_event_id=after_event_id):
+        async for event in orchestrator.subscribe(
+            run_id, after_event_id=after_event_id
+        ):
             yield f"data: {json.dumps(event, ensure_ascii=False, default=str)}\n\n"
 
     return StreamingResponse(gen(), media_type="text/event-stream")
 
 
 @router.post("/api/runs/{run_id}/publish/hf", response_model=HFPublishResponse)
-async def publish_to_hf(payload: HFPublishRequest, http_request: Request, run_id: str = ID):
+async def publish_to_hf(
+    payload: HFPublishRequest, http_request: Request, run_id: str = ID
+):
     db = get_db()
     run = await _require_run(run_id)
     if run["status"] != "completed":
@@ -293,7 +371,9 @@ async def publish_to_hf(payload: HFPublishRequest, http_request: Request, run_id
     #   1. signed-in user's HF OAuth token (preferred — tied to a real user)
     #   2. token explicitly passed in the request body (paste-token mode)
     #   3. HF_TOKEN env var (single-user / dev fallback, resolved inside publisher)
-    user = session_user(http_request.session) if hasattr(http_request, "session") else None
+    user = (
+        session_user(http_request.session) if hasattr(http_request, "session") else None
+    )
     sess_tok = session_token(http_request.session) if user else None
 
     # If OAuth is enabled and the user is signed in, ignore the body token —
@@ -302,7 +382,9 @@ async def publish_to_hf(payload: HFPublishRequest, http_request: Request, run_id
     # publishes entirely so the panel acts as a true gate.
     if is_oauth_enabled():
         if not sess_tok:
-            raise HTTPException(status_code=401, detail="Sign in with HuggingFace to publish.")
+            raise HTTPException(
+                status_code=401, detail="Sign in with HuggingFace to publish."
+            )
         token = sess_tok
     else:
         token = payload.token
diff --git a/ocr_pipeline/services/ocr_pair_exporter.py b/ocr_pipeline/services/ocr_pair_exporter.py
new file mode 100644
index 0000000..a4cdb40
--- /dev/null
+++ b/ocr_pipeline/services/ocr_pair_exporter.py
@@ -0,0 +1,212 @@
+import hashlib
+import json
+import shutil
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+
+from ocr_pipeline.config import settings
+from ocr_pipeline.services.dataset_exporter import PROJECT_METADATA
+from ocr_pipeline.services.output_writer import PAGE_BREAK
+from ocr_pipeline.services.pdf_renderer import PDFRenderer
+
+
+@dataclass(frozen=True)
+class OCRPairExportResult:
+    export_dir: Path
+    bundle: Path
+    pages_count: int
+
+
+class OCRPairExporter:
+    """Builds image/text pairs for OCR model fine-tuning."""
+
+    def __init__(self, export_dir: Path, renderer: PDFRenderer | None = None):
+        self.export_dir = export_dir
+        self.renderer = renderer or PDFRenderer()
+
+    @staticmethod
+    def _split_pages(text: str, total_pages: int) -> list[str]:
+        pages = text.split(PAGE_BREAK) if text else [""]
+        if len(pages) < total_pages:
+            pages.extend([""] * (total_pages - len(pages)))
+        return pages[:total_pages]
+
+    @staticmethod
+    def _split_name(stable_key: str) -> str:
+        bucket = (
+            int(hashlib.sha256(stable_key.encode("utf-8")).hexdigest()[:8], 16) % 100
+        )
+        if bucket < 90:
+            return "train"
+        if bucket < 95:
+            return "validation"
+        return "test"
+
+    @staticmethod
+    def _json_list(raw: str | None) -> list[str]:
+        if not raw:
+            return []
+        try:
+            value = json.loads(raw)
+        except json.JSONDecodeError:
+            return [part.strip() for part in raw.split(",") if part.strip()]
+        return [str(item) for item in value] if isinstance(value, list) else []
+
+    @staticmethod
+    def _language_list(value) -> list[str]:
+        if isinstance(value, list):
+            return [str(item).strip() for item in value if str(item).strip()]
+        if not value:
+            return []
+        return [part.strip() for part in str(value).split(",") if part.strip()]
+
+    def export_run(
+        self,
+        *,
+        run: dict,
+        documents: list[dict],
+        pages_by_document: dict[str, list[dict]],
+        catalog_by_document: dict[str, dict],
+        dpi: int = 160,
+        text_mode: str = "clean",
+    ) -> OCRPairExportResult:
+        if text_mode not in {"clean", "raw"}:
+            raise ValueError("text_mode must be clean or raw")
+
+        if self.export_dir.exists():
+            shutil.rmtree(self.export_dir)
+        images_dir = self.export_dir / "images"
+        images_dir.mkdir(parents=True, exist_ok=True)
+
+        split_rows: dict[str, list[dict]] = {"train": [], "validation": [], "test": []}
+        pages_count = 0
+
+        for doc in documents:
+            if doc.get("status") != "completed":
+                continue
+            document_id = doc["document_id"]
+            catalog = catalog_by_document.get(document_id, {})
+            pdf_path_str = doc.get("artifact_source_pdf") or doc.get(
+                "document_source_path"
+            )
+            if not pdf_path_str:
+                continue
+            pdf_path = Path(pdf_path_str)
+            if not pdf_path.exists():
+                continue
+
+            total_pages = int(
+                doc.get("total_pages")
+                or len(pages_by_document.get(document_id, []))
+                or 0
+            )
+            raw_pages = self._split_pages(
+                self._read_text(doc.get("artifact_raw_txt")), total_pages
+            )
+            clean_pages = self._split_pages(
+                self._read_text(doc.get("artifact_clean_txt")), total_pages
+            )
+            page_rows = {
+                row["page_num"]: row for row in pages_by_document.get(document_id, [])
+            }
+
+            for page_num in range(1, total_pages + 1):
+                page_id = f"{document_id}_page_{page_num:04d}"
+                image_rel = f"images/{page_id}.png"
+                image = self.renderer.render_page(pdf_path, page_num, dpi)
+                image.save(images_dir / f"{page_id}.png", format="PNG")
+
+                page_meta = page_rows.get(page_num, {})
+                raw_text = raw_pages[page_num - 1]
+                clean_text = clean_pages[page_num - 1]
+                text = clean_text if text_mode == "clean" else raw_text
+                split = self._split_name(page_id)
+
+                split_rows[split].append(
+                    {
+                        "id": page_id,
+                        "image": image_rel,
+                        "text": text,
+                        "raw_text": raw_text,
+                        "clean_text": clean_text,
+                        "text_mode": text_mode,
+                        "document_id": document_id,
+                        "document_name": doc.get("document_filename"),
+                        "page": page_num,
+                        "group_path": catalog.get("group_path"),
+                        "title": catalog.get("display_title")
+                        or catalog.get("pdf_title"),
+                        "author": catalog.get("author") or catalog.get("pdf_author"),
+                        "work": catalog.get("work"),
+                        "book": catalog.get("book"),
+                        "document_date_label": catalog.get("document_date_label"),
+                        "document_date_precision": catalog.get(
+                            "document_date_precision"
+                        ),
+                        "language": self._language_list(catalog.get("language"))
+                        or self._json_list(page_meta.get("detected_languages")),
+                        "script": catalog.get("script")
+                        or page_meta.get("primary_script"),
+                        "ocr_status": page_meta.get("status"),
+                        "validation_issues": self._json_list(
+                            page_meta.get("validation_issues")
+                        ),
+                        "extraction_mode": page_meta.get("extraction_mode"),
+                        "extraction_attempt": page_meta.get("extraction_attempt"),
+                        "dpi_used": page_meta.get("dpi_used"),
+                        "source_file": doc.get("document_filename"),
+                        "source_pdf_sha256": doc.get("file_sha256"),
+                        "ocr_model": run.get("model_used"),
+                        "pipeline_version": run.get("pipeline_version"),
+                    }
+                )
+                pages_count += 1
+
+        self._write_jsonl(split_rows)
+        self._write_manifest(run, pages_count, dpi, text_mode)
+        bundle = self.export_dir.with_suffix(".zip")
+        if bundle.exists():
+            bundle.unlink()
+        with zipfile.ZipFile(bundle, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+            for path in sorted(self.export_dir.rglob("*")):
+                if path.is_file():
+                    archive.write(path, arcname=path.relative_to(self.export_dir))
+        return OCRPairExportResult(self.export_dir, bundle, pages_count)
+
+    @staticmethod
+    def _read_text(path_str: str | None) -> str:
+        if not path_str:
+            return ""
+        path = Path(path_str)
+        return path.read_text(encoding="utf-8") if path.exists() else ""
+
+    def _write_jsonl(self, split_rows: dict[str, list[dict]]) -> None:
+        for split, rows in split_rows.items():
+            path = self.export_dir / f"{split}.jsonl"
+            path.write_text(
+                "".join(json.dumps(row, ensure_ascii=False) + "\n" for row in rows),
+                encoding="utf-8",
+            )
+
+    def _write_manifest(
+        self, run: dict, pages_count: int, dpi: int, text_mode: str
+    ) -> None:
+        payload = {
+            "export_type": "ocr_pairs",
+            "run_id": run["id"],
+            "created_by": PROJECT_METADATA,
+            "pages_count": pages_count,
+            "image_format": "png",
+            "dpi": dpi,
+            "text_mode": text_mode,
+            "schema_version": 1,
+            "ocr_model": run.get("model_used") or settings.model_name,
+            "pipeline_version": run.get("pipeline_version")
+            or settings.pipeline_version,
+            "splits": ["train", "validation", "test"],
+        }
+        (self.export_dir / "manifest.json").write_text(
+            json.dumps(payload, indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css
index 58d630f..e06aaf4 100644
--- a/ocr_pipeline/static/css/style.css
+++ b/ocr_pipeline/static/css/style.css
@@ -312,6 +312,21 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
   border-bottom: 1px solid var(--border);
 }
 .document-toolbar.drag-over { background: var(--accent-soft); }
+.toolbar-input,
+.toolbar-select {
+  height: 30px;
+  min-width: 0;
+  border: 1px solid var(--border-strong);
+  border-radius: 999px;
+  background: var(--surface-strong);
+  color: var(--text);
+  font: inherit;
+  font-size: 0.82rem;
+}
+.toolbar-input { padding: 6px 11px; }
+.toolbar-select { padding: 5px 10px; }
+.toolbar-search { flex: 1; max-width: 260px; }
+.toolbar-group { width: 150px; }
 
 .document-table { display: grid; }
 .document-group { display: grid; }
@@ -690,9 +705,11 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 
 @media (max-width: 720px) {
   .topbar { flex-direction: column; align-items: stretch; gap: 10px; }
+  .document-toolbar { flex-wrap: wrap; }
+  .toolbar-search, .toolbar-group { max-width: none; width: 100%; }
   .document-row { grid-template-columns: 24px minmax(160px, 1fr) 72px 76px; }
   .document-row > :last-child { display: none; }
-  .field-row { grid-template-columns: 1fr; }
+  .document-editor-fields { grid-template-columns: 1fr; }
   .toast-container { right: 10px; left: 10px; }
 }
 
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index 9c53778..412fb1e 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -117,6 +117,25 @@ <h2>Documents</h2>
               <input type="checkbox" :checked="allDocumentsSelected" @change="selectAllDocuments($event.target.checked)">
               <span><span x-text="selectedDocumentIds.length"></span> selected</span>
             </label>
+            <input class="toolbar-input toolbar-search"
+                   type="text"
+                   placeholder="Search"
+                   x-model="documentSearch">
+            <select class="toolbar-select" x-model="documentGroupFilter">
+              <option value="">All groups</option>
+              <template x-for="group in availableDocumentGroups()" :key="group">
+                <option :value="group" x-text="group"></option>
+              </template>
+            </select>
+            <input class="toolbar-input toolbar-group"
+                   type="text"
+                   placeholder="Set group"
+                   x-model="bulkGroupPath">
+            <button class="btn btn-ghost btn-sm"
+                    :disabled="selectedDocumentIds.length === 0 || bulkGrouping"
+                    @click="applyBulkGroup">
+              Group
+            </button>
             <button class="btn btn-ghost btn-sm" @click="refreshDocuments">Refresh</button>
             <span class="muted-note" x-show="uploadProgress !== null" x-text="'Uploading ' + uploadProgress + '%'"></span>
           </div>
@@ -238,6 +257,9 @@ <h2>Run <code x-text="selectedRun?.id"></code></h2>
         <button class="btn btn-ghost" @click="downloadDataset" x-show="selectedRun?.dataset_bundle">
           ↓ Dataset bundle (zip)
         </button>
+        <button class="btn btn-ghost" @click="downloadOCRPairs">
+          ↓ OCR pairs (zip)
+        </button>
         <button class="btn btn-primary"
                 @click="openHFModal"
                 :disabled="!canPublish"
diff --git a/ocr_pipeline/static/js/api.js b/ocr_pipeline/static/js/api.js
index 32bcbf5..37b4842 100644
--- a/ocr_pipeline/static/js/api.js
+++ b/ocr_pipeline/static/js/api.js
@@ -63,6 +63,19 @@ const API = {
     return res.json();
   },
 
+  async bulkUpdateDocuments(payload) {
+    const res = await fetch('/api/documents/bulk', {
+      method: 'PATCH',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+    });
+    if (!res.ok) {
+      const data = await res.json().catch(() => ({}));
+      throw new Error(data.detail || 'Failed to update documents');
+    }
+    return res.json();
+  },
+
   async createRun(filePaths, { name, stripRefs = false, exportParquet = true } = {}) {
     const res = await fetch('/api/runs', {
       method: 'POST',
@@ -130,6 +143,10 @@ const API = {
     return `/api/runs/${encodeURIComponent(runId)}/dataset/download`;
   },
 
+  ocrPairsDownloadUrl(runId, { dpi = 160, textMode = 'clean' } = {}) {
+    return `/api/runs/${encodeURIComponent(runId)}/ocr-pairs/download?dpi=${dpi}&text_mode=${encodeURIComponent(textMode)}`;
+  },
+
   async publishToHF(runId, payload) {
     const res = await fetch(`/api/runs/${encodeURIComponent(runId)}/publish/hf`, {
       method: 'POST',
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index 90dc9c4..9b18b20 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -36,6 +36,10 @@ function opencrApp() {
     selectedDocumentIds: [],
     selectedDocumentId: null,
     documentDraft: {},
+    documentSearch: '',
+    documentGroupFilter: '',
+    bulkGroupPath: '',
+    bulkGrouping: false,
     savingDocument: false,
 
     inputFiles: [],
@@ -254,9 +258,33 @@ function opencrApp() {
       return this.documents.find(d => d.id === this.selectedDocumentId) || null;
     },
 
+    availableDocumentGroups() {
+      return [...new Set(this.documents.map(d => (d.group_path || '').trim()).filter(Boolean))].sort();
+    },
+
+    filteredDocuments() {
+      const query = this.documentSearch.trim().toLowerCase();
+      return this.documents.filter((doc) => {
+        const group = (doc.group_path || '').trim();
+        if (this.documentGroupFilter && group !== this.documentGroupFilter) return false;
+        if (!query) return true;
+        return [
+          doc.display_title,
+          doc.filename,
+          doc.group_path,
+          doc.author,
+          doc.work,
+          doc.book,
+          doc.document_date_label,
+          doc.language,
+          doc.script,
+        ].some(value => String(value || '').toLowerCase().includes(query));
+      });
+    },
+
     groupedDocuments() {
       const groups = new Map();
-      for (const doc of this.documents) {
+      for (const doc of this.filteredDocuments()) {
         const name = (doc.group_path || '').trim() || 'Ungrouped';
         if (!groups.has(name)) groups.set(name, []);
         groups.get(name).push(doc);
@@ -277,11 +305,17 @@ function opencrApp() {
     },
 
     selectAllDocuments(checked) {
-      this.selectedDocumentIds = checked ? this.documents.map(d => d.id) : [];
+      const visibleIds = this.filteredDocuments().map(d => d.id);
+      if (!checked) {
+        this.selectedDocumentIds = this.selectedDocumentIds.filter(id => !visibleIds.includes(id));
+        return;
+      }
+      this.selectedDocumentIds = [...new Set([...this.selectedDocumentIds, ...visibleIds])];
     },
 
     get allDocumentsSelected() {
-      return this.documents.length > 0 && this.selectedDocumentIds.length === this.documents.length;
+      const visibleIds = this.filteredDocuments().map(d => d.id);
+      return visibleIds.length > 0 && visibleIds.every(id => this.selectedDocumentIds.includes(id));
     },
 
     selectedDocumentPaths() {
@@ -318,6 +352,24 @@ function opencrApp() {
       }
     },
 
+    async applyBulkGroup() {
+      if (this.selectedDocumentIds.length === 0 || this.bulkGrouping) return;
+      this.bulkGrouping = true;
+      try {
+        await API.bulkUpdateDocuments({
+          document_ids: this.selectedDocumentIds,
+          group_path: this.bulkGroupPath || null,
+        });
+        await this.refreshDocuments();
+        if (this.selectedDocumentId) this.selectDocument(this.selectedDocumentId);
+        this.toast('Group updated', 'success');
+      } catch (e) {
+        this.toast(`Group update failed: ${e.message}`, 'error');
+      } finally {
+        this.bulkGrouping = false;
+      }
+    },
+
     async startDocumentsRun() {
       const paths = this.selectedDocumentPaths();
       if (paths.length === 0) return this.toast('Select documents first', 'error');
@@ -413,6 +465,10 @@ function opencrApp() {
       if (this.selectedRunId) this._download(API.datasetDownloadUrl(this.selectedRunId));
     },
 
+    downloadOCRPairs() {
+      if (this.selectedRunId) this._download(API.ocrPairsDownloadUrl(this.selectedRunId));
+    },
+
     openHFModal() {
       if (!this.selectedRunId) return;
       if (!this.canPublish) {
diff --git a/tests/test_ocr_pair_exporter.py b/tests/test_ocr_pair_exporter.py
new file mode 100644
index 0000000..8ba84c3
--- /dev/null
+++ b/tests/test_ocr_pair_exporter.py
@@ -0,0 +1,117 @@
+import json
+import zipfile
+
+from PIL import Image
+
+from ocr_pipeline.services.ocr_pair_exporter import OCRPairExporter
+from ocr_pipeline.services.output_writer import OutputWriter
+from ocr_pipeline.services.run_storage import RunStorage
+from tests.test_output_writer import build_document, build_script
+
+
+class FakeRenderer:
+    def render_page(self, _pdf_path, page_num, _dpi):
+        return Image.new("RGB", (16, 16), color=(page_num * 40, 20, 20))
+
+
+def test_ocr_pair_export_writes_images_jsonl_and_manifest(tmp_path):
+    storage = RunStorage(output_root=tmp_path, runs_root=tmp_path / "runs")
+    storage.ensure_run_dirs("run-1234")
+
+    document = build_document()
+    document_id = document.file_sha256[:16]
+    paths = storage.artifact_paths("run-1234", document_id, document.filename)
+    paths.source_pdf.write_bytes(b"%PDF-1.4\n")
+
+    OutputWriter().write_all(
+        paths=paths,
+        raw_pages_text=["raw page one", "raw page two"],
+        clean_pages_text=["clean page one", "clean page two"],
+        pages_metadata=document.pages,
+        pages_script=[build_script(), build_script()],
+        doc_metadata=document,
+    )
+
+    exporter = OCRPairExporter(
+        storage.dataset_dir("run-1234") / "ocr_pairs", renderer=FakeRenderer()
+    )
+    result = exporter.export_run(
+        run={
+            "id": "run-1234",
+            "model_used": "deepseek-ai/DeepSeek-OCR",
+            "pipeline_version": "2.0.0",
+        },
+        documents=[
+            {
+                "document_id": document_id,
+                "document_filename": document.filename,
+                "status": "completed",
+                "total_pages": 2,
+                "file_sha256": document.file_sha256,
+                "artifact_source_pdf": str(paths.source_pdf),
+                "artifact_raw_txt": str(paths.raw_txt),
+                "artifact_clean_txt": str(paths.clean_txt),
+            }
+        ],
+        pages_by_document={
+            document_id: [
+                {
+                    "page_num": 1,
+                    "status": "pass",
+                    "detected_languages": json.dumps(["ota-Latn"]),
+                    "validation_issues": "[]",
+                    "primary_script": "latin_extended",
+                    "extraction_mode": "markdown",
+                    "extraction_attempt": 1,
+                    "dpi_used": 160,
+                },
+                {
+                    "page_num": 2,
+                    "status": "pass",
+                    "detected_languages": json.dumps(["ota-Latn"]),
+                    "validation_issues": "[]",
+                    "primary_script": "latin_extended",
+                    "extraction_mode": "free_ocr",
+                    "extraction_attempt": 2,
+                    "dpi_used": 160,
+                },
+            ]
+        },
+        catalog_by_document={
+            document_id: {
+                "group_path": "Ottoman/Seyahatname",
+                "author": "Evliyâ Çelebi",
+                "work": "Seyahatnâme",
+                "language": "ota-Latn,tr",
+                "script": "latin_extended",
+            }
+        },
+    )
+
+    assert result.pages_count == 2
+    assert result.bundle.exists()
+
+    rows = []
+    for split in ("train", "validation", "test"):
+        rows.extend(
+            json.loads(line)
+            for line in (result.export_dir / f"{split}.jsonl")
+            .read_text(encoding="utf-8")
+            .splitlines()
+        )
+    assert [row["text"] for row in rows] == ["clean page one", "clean page two"]
+    assert rows[0]["image"].startswith("images/")
+    assert rows[0]["group_path"] == "Ottoman/Seyahatname"
+    assert rows[1]["extraction_mode"] == "free_ocr"
+
+    manifest = json.loads(
+        (result.export_dir / "manifest.json").read_text(encoding="utf-8")
+    )
+    assert manifest["export_type"] == "ocr_pairs"
+    assert manifest["created_by"]["organization"] == "cdli.ai"
+
+    with zipfile.ZipFile(result.bundle) as archive:
+        names = set(archive.namelist())
+    assert "manifest.json" in names
+    assert "train.jsonl" in names
+    assert any(name.startswith("images/") and name.endswith(".png") for name in names)
diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py
index f0d8355..aa11e99 100644
--- a/tests/test_ui_routes.py
+++ b/tests/test_ui_routes.py
@@ -65,6 +65,53 @@ async def fake_wait_for_model_server():
         assert patch.json()["metadata_complete"] is True
         assert patch.json()["group_path"] == "Ottoman/Seyahatname"
 
+        bulk = client.patch(
+            "/api/documents/bulk",
+            json={"document_ids": [docs[0]["id"]], "group_path": "Grouped/Batch"},
+        )
+        assert bulk.status_code == 200
+        assert bulk.json()[0]["group_path"] == "Grouped/Batch"
+
+
+def test_duplicate_upload_filenames_keep_distinct_source_paths(tmp_path, monkeypatch):
+    input_dir = tmp_path / "input"
+    output_dir = tmp_path / "output"
+    runs_dir = output_dir / "runs"
+    db_path = output_dir / "opencr.sqlite"
+    input_dir.mkdir()
+    output_dir.mkdir()
+
+    async def fake_wait_for_model_server():
+        model_readiness.ready = True
+        return True
+
+    monkeypatch.setattr(settings, "input_dir", input_dir)
+    monkeypatch.setattr(settings, "output_dir", output_dir)
+    monkeypatch.setattr(settings, "runs_dir", runs_dir)
+    monkeypatch.setattr(settings, "db_path", db_path)
+    monkeypatch.setattr(
+        main_module, "wait_for_model_server", fake_wait_for_model_server
+    )
+    model_readiness.ready = True
+
+    with TestClient(main_module.app) as client:
+        first = client.post(
+            "/api/upload",
+            files={"file": ("sample.pdf", b"%PDF-1.4 first\n", "application/pdf")},
+        )
+        second = client.post(
+            "/api/upload",
+            files={"file": ("sample.pdf", b"%PDF-1.4 second\n", "application/pdf")},
+        )
+        assert first.status_code == 200
+        assert second.status_code == 200
+        assert first.json()["path"] != second.json()["path"]
+        assert Path(first.json()["path"]).exists()
+        assert Path(second.json()["path"]).exists()
+
+        docs = client.get("/api/documents").json()
+        assert len({doc["source_path"] for doc in docs}) == 2
+
 
 def test_runs_list_empty_when_no_runs(tmp_path, monkeypatch):
     output_dir = tmp_path / "output"
@@ -113,7 +160,14 @@ def test_home_uses_document_workbench():
     assert "document_date_precision" in html
     assert "group_path" in html
     assert "OCR snapshot" in html
+    assert "OCR pairs" in html
     assert "selectedDocumentIds" in app_js
+    assert "availableDocumentGroups()" in app_js
+    assert "filteredDocuments()" in app_js
     assert "groupedDocuments()" in app_js
+    assert "applyBulkGroup()" in app_js
+    assert "downloadOCRPairs()" in app_js
+    assert "selectedRunDocumentIds" in app_js
+    assert "documentProcessLabel" in app_js
     assert "selectedPageText()" in app_js
     assert "saveSelectedDocument()" in app_js

From 565eec9466f2bdb11d1882826491e552a4bc07d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 21:45:00 +0300
Subject: [PATCH 05/15] feat: support partial dataset downloads and cached OCR
 pair exports with file hashing

---
 ocr_pipeline/routers/documents.py          |  25 ++--
 ocr_pipeline/routers/runs.py               |  31 ++++-
 ocr_pipeline/routers/ui.py                 |  17 ++-
 ocr_pipeline/services/db.py                |  34 ++++++
 ocr_pipeline/services/ocr_pair_exporter.py |  54 ++++++--
 ocr_pipeline/static/index.html             |  13 +-
 ocr_pipeline/static/js/api.js              |   6 +-
 ocr_pipeline/static/js/app.js              |  38 +++++-
 tests/test_document_catalog.py             |  31 +++++
 tests/test_ocr_pair_exporter.py            | 136 ++++++++++++++++++++-
 10 files changed, 352 insertions(+), 33 deletions(-)

diff --git a/ocr_pipeline/routers/documents.py b/ocr_pipeline/routers/documents.py
index b301e17..df4d731 100644
--- a/ocr_pipeline/routers/documents.py
+++ b/ocr_pipeline/routers/documents.py
@@ -34,19 +34,20 @@ async def update_documents_bulk(payload: BulkDocumentUpdate):
     if not payload.document_ids:
         raise HTTPException(status_code=400, detail="document_ids must not be empty")
     db = get_db()
-    for document_id in payload.document_ids:
-        try:
-            await db.update_document_metadata(
-                document_id,
-                group_path=payload.group_path,
-            )
-        except KeyError:
-            raise HTTPException(
-                status_code=404, detail=f"Document not found: {document_id}"
-            )
+    try:
+        await db.update_documents_metadata(
+            payload.document_ids,
+            group_path=payload.group_path,
+        )
+    except KeyError as exc:
+        raise HTTPException(
+            status_code=404, detail=f"Document not found: {exc.args[0]}"
+        )
     documents = await db.list_documents(limit=1000)
-    selected = {document_id for document_id in payload.document_ids}
-    return [_document_summary(doc) for doc in documents if doc["id"] in selected]
+    by_id = {doc["id"]: doc for doc in documents}
+    return [
+        _document_summary(by_id[document_id]) for document_id in payload.document_ids
+    ]
 
 
 @router.get("/api/documents/{document_id}", response_model=DocumentSummary)
diff --git a/ocr_pipeline/routers/runs.py b/ocr_pipeline/routers/runs.py
index d6b2e1e..e003d8d 100644
--- a/ocr_pipeline/routers/runs.py
+++ b/ocr_pipeline/routers/runs.py
@@ -1,4 +1,5 @@
 import asyncio
+import hashlib
 import json
 from io import BytesIO
 from pathlib import Path
@@ -310,6 +311,7 @@ async def download_ocr_pairs(
     run_id: str = ID,
     dpi: int = Query(160, ge=50, le=400),
     text_mode: str = Query("clean", pattern="^(clean|raw)$"),
+    document_ids: str | None = Query(None),
 ):
     db = get_db()
     run = await _require_run(run_id)
@@ -317,6 +319,17 @@ async def download_ocr_pairs(
         raise HTTPException(status_code=409, detail="Run is not yet completed")
 
     documents = await db.list_run_documents(run_id)
+    selected_ids = {
+        part.strip() for part in (document_ids or "").split(",") if part.strip()
+    } or None
+    if selected_ids:
+        available_ids = {doc["document_id"] for doc in documents}
+        missing = selected_ids - available_ids
+        if missing:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Document not found in run: {sorted(missing)[0]}",
+            )
     pages_by_document = {
         doc["document_id"]: await db.list_pages(run_id, doc["document_id"])
         for doc in documents
@@ -325,13 +338,29 @@ async def download_ocr_pairs(
         doc["document_id"]: await db.get_document(doc["document_id"]) or {}
         for doc in documents
     }
-    exporter = OCRPairExporter(settings.runs_dir / run_id / "dataset" / "ocr_pairs")
+    scope = "all"
+    if selected_ids:
+        scope = hashlib.sha256(
+            ",".join(sorted(selected_ids)).encode("utf-8")
+        ).hexdigest()[:12]
+    export_dir = (
+        settings.runs_dir / run_id / "dataset" / f"ocr_pairs_{text_mode}_{dpi}_{scope}"
+    )
+    cached_bundle = export_dir.with_suffix(".zip")
+    if cached_bundle.exists():
+        return FileResponse(
+            cached_bundle,
+            media_type="application/zip",
+            filename=f"{run_id}-ocr-pairs.zip",
+        )
+    exporter = OCRPairExporter(export_dir)
     result = await asyncio.to_thread(
         exporter.export_run,
         run=run,
         documents=documents,
         pages_by_document=pages_by_document,
         catalog_by_document=catalog_by_document,
+        document_ids=selected_ids,
         dpi=dpi,
         text_mode=text_mode,
     )
diff --git a/ocr_pipeline/routers/ui.py b/ocr_pipeline/routers/ui.py
index a66da74..dfe7ada 100644
--- a/ocr_pipeline/routers/ui.py
+++ b/ocr_pipeline/routers/ui.py
@@ -1,6 +1,7 @@
 """Static-friendly endpoints for input file management. Output/dataset listing
 moved to /api/runs."""
 
+import hashlib
 from pathlib import Path
 
 from fastapi import APIRouter, HTTPException, UploadFile
@@ -25,12 +26,18 @@ async def upload_pdf(file: UploadFile):
         raise HTTPException(status_code=400, detail="Invalid filename")
 
     settings.input_dir.mkdir(parents=True, exist_ok=True)
-    dest = settings.input_dir / safe_name
     content = await file.read()
+    digest = hashlib.sha256(content).hexdigest()
+    dest = settings.input_dir / f"{digest[:16]}__{safe_name}"
     dest.write_bytes(content)
     await catalog_pdf(get_db(), dest, filename=safe_name)
 
-    return {"filename": safe_name, "size": len(content), "path": str(dest)}
+    return {
+        "filename": safe_name,
+        "stored_filename": dest.name,
+        "size": len(content),
+        "path": str(dest),
+    }
 
 
 @router.get("/api/files/input", response_model=list[FileInfo])
@@ -40,13 +47,17 @@ async def list_input_files():
     if not input_dir.exists():
         return []
 
+    documents_by_path = {
+        doc["source_path"]: doc for doc in await get_db().list_documents(limit=1000)
+    }
     files = []
     for p in sorted(input_dir.iterdir()):
         if p.is_file() and p.suffix.lower() == ".pdf":
             stat = p.stat()
+            document = documents_by_path.get(str(p))
             files.append(
                 FileInfo(
-                    name=p.name,
+                    name=document["filename"] if document else p.name,
                     size=stat.st_size,
                     modified=stat.st_mtime,
                     path=str(p),
diff --git a/ocr_pipeline/services/db.py b/ocr_pipeline/services/db.py
index 90dccc4..d3337d6 100644
--- a/ocr_pipeline/services/db.py
+++ b/ocr_pipeline/services/db.py
@@ -431,6 +431,57 @@ async def update_document_metadata(
             raise KeyError(document_id)
         return doc
 
+    async def update_documents_metadata(
+        self, document_ids: list[str], **fields: Any
+    ) -> list[dict[str, Any]]:
+        """Apply the same catalog metadata to several documents at once.
+
+        Every id is looked up before anything is written, so an unknown id
+        leaves all rows untouched. Keyword arguments that are not in
+        ``DOCUMENT_METADATA_FIELDS`` are ignored.
+
+        Args:
+            document_ids: Ids of the documents to update.
+            **fields: Column values to set on every listed document.
+
+        Returns:
+            The documents as read back through ``get_document``, in the
+            order of ``document_ids``; empty when no ids are given.
+
+        Raises:
+            KeyError: With the first id that has no row in ``documents``.
+        """
+        if not document_ids:
+            return []
+        clean = {k: v for k, v in fields.items() if k in DOCUMENT_METADATA_FIELDS}
+        placeholders = ", ".join("?" for _ in document_ids)
+        async with self.conn.execute(
+            f"SELECT id FROM documents WHERE id IN ({placeholders})",
+            document_ids,
+        ) as cur:
+            existing = {row["id"] for row in await cur.fetchall()}
+        missing = [
+            document_id for document_id in document_ids if document_id not in existing
+        ]
+        if missing:
+            raise KeyError(missing[0])
+        if clean:
+            clean["catalog_updated_at"] = _now()
+            cols = ", ".join(f"{k} = ?" for k in clean)
+            # One set-based statement: SQLite applies it atomically, so a
+            # failure cannot leave only some of the documents updated.
+            await self.conn.execute(
+                f"UPDATE documents SET {cols} WHERE id IN ({placeholders})",
+                [*clean.values(), *document_ids],
+            )
+            await self.conn.commit()
+        docs = []
+        for document_id in document_ids:
+            doc = await self.get_document(document_id)
+            if doc:
+                docs.append(doc)
+        return docs
+
     async def list_document_runs(self, document_id: str) -> list[dict[str, Any]]:
         async with self.conn.execute(
             """
diff --git a/ocr_pipeline/services/ocr_pair_exporter.py b/ocr_pipeline/services/ocr_pair_exporter.py
index a4cdb40..17e3a13 100644
--- a/ocr_pipeline/services/ocr_pair_exporter.py
+++ b/ocr_pipeline/services/ocr_pair_exporter.py
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import shutil
+import tempfile
 import zipfile
 from dataclasses import dataclass
 from pathlib import Path
@@ -45,6 +46,8 @@ def _split_name(stable_key: str) -> str:
 
     @staticmethod
     def _json_list(raw: str | None) -> list[str]:
+        if isinstance(raw, list):
+            return [str(item) for item in raw]
         if not raw:
             return []
         try:
@@ -68,15 +71,19 @@ def export_run(
         documents: list[dict],
         pages_by_document: dict[str, list[dict]],
         catalog_by_document: dict[str, dict],
+        document_ids: set[str] | None = None,
         dpi: int = 160,
         text_mode: str = "clean",
     ) -> OCRPairExportResult:
         if text_mode not in {"clean", "raw"}:
             raise ValueError("text_mode must be clean or raw")
 
-        if self.export_dir.exists():
-            shutil.rmtree(self.export_dir)
-        images_dir = self.export_dir / "images"
+        tmp_parent = self.export_dir.parent
+        tmp_parent.mkdir(parents=True, exist_ok=True)
+        tmp_path = Path(
+            tempfile.mkdtemp(prefix=f"{self.export_dir.name}.", dir=tmp_parent)
+        )
+        images_dir = tmp_path / "images"
         images_dir.mkdir(parents=True, exist_ok=True)
 
         split_rows: dict[str, list[dict]] = {"train": [], "validation": [], "test": []}
@@ -86,6 +93,8 @@ def export_run(
             if doc.get("status") != "completed":
                 continue
             document_id = doc["document_id"]
+            if document_ids is not None and document_id not in document_ids:
+                continue
             catalog = catalog_by_document.get(document_id, {})
             pdf_path_str = doc.get("artifact_source_pdf") or doc.get(
                 "document_source_path"
@@ -121,16 +130,24 @@ def export_run(
                 raw_text = raw_pages[page_num - 1]
                 clean_text = clean_pages[page_num - 1]
                 text = clean_text if text_mode == "clean" else raw_text
-                split = self._split_name(page_id)
+                split_key = doc.get("file_sha256") or document_id
+                split = self._split_name(split_key)
+                image_path = images_dir / f"{page_id}.png"
+                image_hash = hashlib.sha256(image_path.read_bytes()).hexdigest()
 
                 split_rows[split].append(
                     {
                         "id": page_id,
+                        "run_id": run["id"],
                         "image": image_rel,
                         "text": text,
                         "raw_text": raw_text,
                         "clean_text": clean_text,
                         "text_mode": text_mode,
+                        "label_source": "cleaned_machine_ocr"
+                        if text_mode == "clean"
+                        else "machine_ocr",
+                        "review_status": "unreviewed",
                         "document_id": document_id,
                         "document_name": doc.get("document_filename"),
                         "page": page_num,
@@ -155,6 +172,10 @@ def export_run(
                         "extraction_mode": page_meta.get("extraction_mode"),
                         "extraction_attempt": page_meta.get("extraction_attempt"),
                         "dpi_used": page_meta.get("dpi_used"),
+                        "render_dpi": dpi,
+                        "image_width": image.width,
+                        "image_height": image.height,
+                        "image_sha256": image_hash,
                         "source_file": doc.get("document_filename"),
                         "source_pdf_sha256": doc.get("file_sha256"),
                         "ocr_model": run.get("model_used"),
@@ -163,8 +184,11 @@ def export_run(
                 )
                 pages_count += 1
 
-        self._write_jsonl(split_rows)
-        self._write_manifest(run, pages_count, dpi, text_mode)
+        self._write_jsonl(tmp_path, split_rows)
+        self._write_manifest(tmp_path, run, pages_count, dpi, text_mode)
+        if self.export_dir.exists():
+            shutil.rmtree(self.export_dir)
+        tmp_path.replace(self.export_dir)
         bundle = self.export_dir.with_suffix(".zip")
         if bundle.exists():
             bundle.unlink()
@@ -181,16 +205,16 @@ def _read_text(path_str: str | None) -> str:
         path = Path(path_str)
         return path.read_text(encoding="utf-8") if path.exists() else ""
 
-    def _write_jsonl(self, split_rows: dict[str, list[dict]]) -> None:
+    def _write_jsonl(self, export_dir: Path, split_rows: dict[str, list[dict]]) -> None:
         for split, rows in split_rows.items():
-            path = self.export_dir / f"{split}.jsonl"
+            path = export_dir / f"{split}.jsonl"
             path.write_text(
                 "".join(json.dumps(row, ensure_ascii=False) + "\n" for row in rows),
                 encoding="utf-8",
             )
 
     def _write_manifest(
-        self, run: dict, pages_count: int, dpi: int, text_mode: str
+        self, export_dir: Path, run: dict, pages_count: int, dpi: int, text_mode: str
     ) -> None:
         payload = {
             "export_type": "ocr_pairs",
@@ -200,13 +224,23 @@ def _write_manifest(
             "image_format": "png",
             "dpi": dpi,
             "text_mode": text_mode,
+            "dataset_purpose": "ocr_audit",
+            "label_source": "cleaned_machine_ocr"
+            if text_mode == "clean"
+            else "machine_ocr",
+            "review_status": "unreviewed",
             "schema_version": 1,
+            "split_strategy": {
+                "method": "sha256_bucket",
+                "key": "source_pdf_sha256",
+                "ratios": {"train": 0.90, "validation": 0.05, "test": 0.05},
+            },
             "ocr_model": run.get("model_used") or settings.model_name,
             "pipeline_version": run.get("pipeline_version")
             or settings.pipeline_version,
             "splits": ["train", "validation", "test"],
         }
-        (self.export_dir / "manifest.json").write_text(
+        (export_dir / "manifest.json").write_text(
             json.dumps(payload, indent=2, ensure_ascii=False),
             encoding="utf-8",
         )
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index 412fb1e..303f5a8 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -170,7 +170,9 @@ <h2>Documents</h2>
                     <span class="pill pill-sm"
                           :class="doc.metadata_complete ? 'pill-success' : 'pill-warn'"
                           x-text="doc.metadata_complete ? 'ready' : 'missing'"></span>
-                    <span x-text="doc.latest_run_status || 'never'"></span>
+                    <span class="pill pill-sm"
+                          :class="documentProcessClass(doc)"
+                          x-text="documentProcessLabel(doc)"></span>
                   </div>
                 </template>
               </div>
@@ -258,7 +260,7 @@ <h2>Run <code x-text="selectedRun?.id"></code></h2>
           ↓ Dataset bundle (zip)
         </button>
         <button class="btn btn-ghost" @click="downloadOCRPairs">
-          ↓ OCR pairs (zip)
+          <span x-text="selectedRunDocumentIds.length ? `↓ OCR pairs (${selectedRunDocumentIds.length})` : '↓ OCR pairs (zip)'"></span>
         </button>
         <button class="btn btn-primary"
                 @click="openHFModal"
@@ -280,7 +282,12 @@ <h3 class="section-title">Documents</h3>
                 :class="{ active: inspector.documentId === doc.document_id }"
                 @click="openDocument(doc.document_id)">
               <div class="doc-head">
-                <span class="doc-name" x-text="doc.filename"></span>
+                <label class="checkbox-row" @click.stop>
+                  <input type="checkbox"
+                         :checked="selectedRunDocumentIds.includes(doc.document_id)"
+                         @change="toggleRunDocument(doc.document_id)">
+                  <span class="doc-name" x-text="doc.filename"></span>
+                </label>
                 <span class="pill pill-sm"
                       :class="runStatusClass(doc.status)"
                       x-text="doc.status"></span>
diff --git a/ocr_pipeline/static/js/api.js b/ocr_pipeline/static/js/api.js
index 37b4842..66cc5a2 100644
--- a/ocr_pipeline/static/js/api.js
+++ b/ocr_pipeline/static/js/api.js
@@ -143,8 +143,12 @@ const API = {
     return `/api/runs/${encodeURIComponent(runId)}/dataset/download`;
   },
 
-  ocrPairsDownloadUrl(runId, { dpi = 160, textMode = 'clean' } = {}) {
-    return `/api/runs/${encodeURIComponent(runId)}/ocr-pairs/download?dpi=${dpi}&text_mode=${encodeURIComponent(textMode)}`;
+  // URL of a run's OCR-pairs download. `document_ids` is sent as a
+  // comma-joined list, and only when at least one id is given.
+  ocrPairsDownloadUrl(runId, { dpi = 160, textMode = 'clean', documentIds = [] } = {}) {
+    const params = new URLSearchParams({ dpi: String(dpi), text_mode: textMode });
+    if (documentIds.length > 0) params.set('document_ids', documentIds.join(','));
+    return `/api/runs/${encodeURIComponent(runId)}/ocr-pairs/download?${params.toString()}`;
   },
 
   async publishToHF(runId, payload) {
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index 9b18b20..c7c50f2 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -31,6 +31,7 @@ function opencrApp() {
     runs: [],
     selectedRunId: null,
     selectedRun: null,
+    selectedRunDocumentIds: [],
 
     documents: [],
     selectedDocumentIds: [],
@@ -136,12 +137,14 @@ function opencrApp() {
       if (!runId) {
         this.selectedRunId = null;
         this.selectedRun = null;
+        this.selectedRunDocumentIds = [];
         this.inspector = emptyInspector();
         return;
       }
       this.selectedRunId = runId;
       try {
         this.selectedRun = await API.getRun(runId);
+        this.selectedRunDocumentIds = [];
         const firstCompleted = (this.selectedRun.documents || []).find(d => d.status === 'completed');
         if (firstCompleted) await this.openDocument(firstCompleted.document_id);
         else this.inspector = emptyInspector();
@@ -254,6 +257,32 @@ function opencrApp() {
     pageStatusClass(status) { return PAGE_STATUS[status] || 'page-pending'; },
     runStatusClass(status) { return STATUS_PILL[status] || 'pill-muted'; },
 
+    // Short label for a document's latest run status; anything other than
+    // completed / queued / processing / failed reads as 'never'.
+    documentProcessLabel(doc) {
+      const status = doc.latest_run_status;
+      if (status === 'completed') return 'processed';
+      if (['queued', 'processing'].includes(status)) return 'running';
+      if (status === 'failed') return 'failed';
+      return 'never';
+    },
+
+    // Pill CSS class for the same status buckets; 'pill-muted' is the fallback.
+    documentProcessClass(doc) {
+      const status = doc.latest_run_status;
+      if (status === 'completed') return 'pill-success';
+      if (['queued', 'processing'].includes(status)) return 'pill-active';
+      if (status === 'failed') return 'pill-error';
+      return 'pill-muted';
+    },
+
+    // Add the id to selectedRunDocumentIds, or remove it if already present.
+    toggleRunDocument(documentId) {
+      const i = this.selectedRunDocumentIds.indexOf(documentId);
+      if (i === -1) this.selectedRunDocumentIds.push(documentId);
+      else this.selectedRunDocumentIds.splice(i, 1);
+    },
+
     selectedDocument() {
       return this.documents.find(d => d.id === this.selectedDocumentId) || null;
     },
@@ -373,6 +398,12 @@ function opencrApp() {
     async startDocumentsRun() {
       const paths = this.selectedDocumentPaths();
       if (paths.length === 0) return this.toast('Select documents first', 'error');
+      const alreadyProcessed = this.documents.filter(
+        d => this.selectedDocumentIds.includes(d.id) && d.latest_run_status === 'completed',
+      );
+      if (alreadyProcessed.length > 0 && !confirm(`${alreadyProcessed.length} selected document(s) were already processed. Start a new run anyway?`)) {
+        return;
+      }
       this.selectedPaths = paths;
       await this.startNewRun();
     },
@@ -466,7 +497,13 @@ function opencrApp() {
     },
 
     downloadOCRPairs() {
-      if (this.selectedRunId) this._download(API.ocrPairsDownloadUrl(this.selectedRunId));
+      // Pass the ticked run documents along; the list may be empty.
+      if (this.selectedRunId) {
+        this._download(API.ocrPairsDownloadUrl(
+          this.selectedRunId,
+          { documentIds: this.selectedRunDocumentIds },
+        ));
+      }
     },
 
     openHFModal() {
diff --git a/tests/test_document_catalog.py b/tests/test_document_catalog.py
index 1b75bc6..0df8a98 100644
--- a/tests/test_document_catalog.py
+++ b/tests/test_document_catalog.py
@@ -43,3 +43,35 @@ async def _scenario():
             await db.close()
 
     asyncio.run(_scenario())
+
+
+def test_bulk_document_metadata_validates_before_writing(tmp_path):
+    """A bulk update naming an unknown id raises KeyError and writes nothing."""
+    async def _scenario():
+        db = Database(tmp_path / "opencr.sqlite")
+        await db.connect()
+        try:
+            await db.upsert_document(
+                "doc-1",
+                filename="source.pdf",
+                source_path="/tmp/source.pdf",
+                file_sha256="abc",
+                file_size_bytes=123,
+            )
+
+            try:
+                await db.update_documents_metadata(
+                    ["doc-1", "missing"],
+                    group_path="Should/Not/Write",
+                )
+            except KeyError:
+                pass
+            else:
+                raise AssertionError("expected missing document to fail")
+
+            doc = await db.get_document("doc-1")
+            assert doc["group_path"] is None
+        finally:
+            await db.close()
+
+    asyncio.run(_scenario())
diff --git a/tests/test_ocr_pair_exporter.py b/tests/test_ocr_pair_exporter.py
index 8ba84c3..c641a9f 100644
--- a/tests/test_ocr_pair_exporter.py
+++ b/tests/test_ocr_pair_exporter.py
@@ -6,7 +6,7 @@
 from ocr_pipeline.services.ocr_pair_exporter import OCRPairExporter
 from ocr_pipeline.services.output_writer import OutputWriter
 from ocr_pipeline.services.run_storage import RunStorage
-from tests.test_output_writer import build_document, build_script
+from tests.test_output_writer import build_document, build_page, build_script
 
 
 class FakeRenderer:
@@ -102,6 +102,11 @@ def test_ocr_pair_export_writes_images_jsonl_and_manifest(tmp_path):
     assert [row["text"] for row in rows] == ["clean page one", "clean page two"]
     assert rows[0]["image"].startswith("images/")
     assert rows[0]["group_path"] == "Ottoman/Seyahatname"
+    assert rows[0]["label_source"] == "cleaned_machine_ocr"
+    assert rows[0]["review_status"] == "unreviewed"
+    assert rows[0]["run_id"] == "run-1234"
+    assert rows[0]["image_sha256"]
+    assert rows[0]["image_width"] == 16
     assert rows[1]["extraction_mode"] == "free_ocr"
 
     manifest = json.loads(
@@ -115,3 +120,134 @@ def test_ocr_pair_export_writes_images_jsonl_and_manifest(tmp_path):
     assert "manifest.json" in names
     assert "train.jsonl" in names
     assert any(name.startswith("images/") and name.endswith(".png") for name in names)
+
+
+def test_ocr_pair_export_splits_all_pages_from_document_together(tmp_path):
+    """All 40 pages of one document must land in a single split file."""
+    storage = RunStorage(output_root=tmp_path, runs_root=tmp_path / "runs")
+    storage.ensure_run_dirs("run-many")
+
+    document = build_document()
+    document.pages = [build_page(i) for i in range(1, 41)]
+    document.total_pages = 40
+    document.file_sha256 = "split-doc-sha"
+    document_id = "splitdoc"
+    paths = storage.artifact_paths("run-many", document_id, document.filename)
+    paths.source_pdf.write_bytes(b"%PDF-1.4\n")
+    OutputWriter().write_all(
+        paths=paths,
+        raw_pages_text=[f"raw {i}" for i in range(1, 41)],
+        clean_pages_text=[f"clean {i}" for i in range(1, 41)],
+        pages_metadata=document.pages,
+        pages_script=[build_script() for _ in range(40)],
+        doc_metadata=document,
+    )
+
+    result = OCRPairExporter(
+        storage.dataset_dir("run-many") / "ocr_pairs", renderer=FakeRenderer()
+    ).export_run(
+        run={"id": "run-many", "model_used": "model", "pipeline_version": "2.0.0"},
+        documents=[
+            {
+                "document_id": document_id,
+                "document_filename": document.filename,
+                "status": "completed",
+                "total_pages": 40,
+                "file_sha256": document.file_sha256,
+                "artifact_source_pdf": str(paths.source_pdf),
+                "artifact_raw_txt": str(paths.raw_txt),
+                "artifact_clean_txt": str(paths.clean_txt),
+            }
+        ],
+        pages_by_document={
+            document_id: [
+                {
+                    "page_num": page.page_num,
+                    "status": "pass",
+                    "detected_languages": json.dumps(["en"]),
+                    "validation_issues": "[]",
+                    "primary_script": "latin",
+                    "extraction_mode": "markdown",
+                    "extraction_attempt": 1,
+                    "dpi_used": 160,
+                }
+                for page in document.pages
+            ]
+        },
+        catalog_by_document={document_id: {}},
+    )
+
+    splits_with_rows = [
+        split
+        for split in ("train", "validation", "test")
+        if (result.export_dir / f"{split}.jsonl").read_text(encoding="utf-8").strip()
+    ]
+    assert len(splits_with_rows) == 1
+
+
+def test_ocr_pair_export_can_filter_selected_documents(tmp_path):
+    """With ``document_ids={"doc-a"}`` only doc-a's two pages are exported."""
+    storage = RunStorage(output_root=tmp_path, runs_root=tmp_path / "runs")
+    storage.ensure_run_dirs("run-selected")
+    document = build_document()
+    paths = storage.artifact_paths("run-selected", "doc-a", document.filename)
+    paths.source_pdf.write_bytes(b"%PDF-1.4\n")
+    OutputWriter().write_all(
+        paths=paths,
+        raw_pages_text=["raw page one", "raw page two"],
+        clean_pages_text=["clean page one", "clean page two"],
+        pages_metadata=document.pages,
+        pages_script=[build_script(), build_script()],
+        doc_metadata=document,
+    )
+
+    result = OCRPairExporter(
+        storage.dataset_dir("run-selected") / "ocr_pairs", renderer=FakeRenderer()
+    ).export_run(
+        run={"id": "run-selected", "model_used": "model", "pipeline_version": "2.0.0"},
+        documents=[
+            {
+                "document_id": "doc-a",
+                "document_filename": "a.pdf",
+                "status": "completed",
+                "total_pages": 2,
+                "file_sha256": "doc-a-sha",
+                "artifact_source_pdf": str(paths.source_pdf),
+                "artifact_raw_txt": str(paths.raw_txt),
+                "artifact_clean_txt": str(paths.clean_txt),
+            },
+            {
+                "document_id": "doc-b",
+                "document_filename": "b.pdf",
+                "status": "completed",
+                "total_pages": 2,
+                "file_sha256": "doc-b-sha",
+                "artifact_source_pdf": str(paths.source_pdf),
+                "artifact_raw_txt": str(paths.raw_txt),
+                "artifact_clean_txt": str(paths.clean_txt),
+            },
+        ],
+        pages_by_document={
+            "doc-a": [
+                {"page_num": 1, "status": "pass"},
+                {"page_num": 2, "status": "pass"},
+            ],
+            "doc-b": [
+                {"page_num": 1, "status": "pass"},
+                {"page_num": 2, "status": "pass"},
+            ],
+        },
+        catalog_by_document={"doc-a": {}, "doc-b": {}},
+        document_ids={"doc-a"},
+    )
+
+    rows = []
+    for split in ("train", "validation", "test"):
+        rows.extend(
+            json.loads(line)
+            for line in (result.export_dir / f"{split}.jsonl")
+            .read_text(encoding="utf-8")
+            .splitlines()
+        )
+    assert result.pages_count == 2
+    assert {row["document_id"] for row in rows} == {"doc-a"}

From b7e998c693baf9ec4c7bcae2ec14a3ea7bc167e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 21:54:03 +0300
Subject: [PATCH 06/15] feat: add retry functionality for failed runs and
 ensure incomplete documents are marked as failed on run failure

---
 ocr_pipeline/main.py                      |   3 +
 ocr_pipeline/routers/runs.py              |  31 +++++++
 ocr_pipeline/services/db.py               |  35 ++++++++
 ocr_pipeline/services/run_orchestrator.py |  31 +++++++
 ocr_pipeline/static/index.html            |   4 +
 ocr_pipeline/static/js/api.js             |   9 ++
 ocr_pipeline/static/js/app.js             |  15 ++++
 tests/test_db_sweep.py                    | 100 +++++++++++++++++++++-
 tests/test_ui_routes.py                   |   2 +
 9 files changed, 226 insertions(+), 4 deletions(-)

diff --git a/ocr_pipeline/main.py b/ocr_pipeline/main.py
index 76177f8..c2bbba7 100644
--- a/ocr_pipeline/main.py
+++ b/ocr_pipeline/main.py
@@ -43,6 +43,9 @@ async def lifespan(app: FastAPI):
     orphans = await db.fail_orphan_runs()
     if orphans:
         logger.warning("Marked %d orphan run(s) as failed (process restart).", orphans)
+    failed_docs = await db.fail_documents_for_failed_runs()
+    if failed_docs:
+        logger.warning("Marked %d incomplete document(s) in failed runs.", failed_docs)
 
     storage = RunStorage(output_root=settings.output_dir, runs_root=settings.runs_dir)
     init_orchestrator(db, storage)
diff --git a/ocr_pipeline/routers/runs.py b/ocr_pipeline/routers/runs.py
index e003d8d..b493a53 100644
--- a/ocr_pipeline/routers/runs.py
+++ b/ocr_pipeline/routers/runs.py
@@ -210,6 +210,43 @@ async def create_run(request: RunCreateRequest):
     )
 
 
+@router.post("/api/runs/{run_id}/retry", response_model=RunCreateResponse)
+async def retry_run(run_id: str = ID):
+    """Retry a run's incomplete documents and describe the newly queued run.
+
+    Responds 503 while the model server is not ready, 404 when the
+    orchestrator raises ``KeyError`` and 409 when it raises ``ValueError``.
+    """
+    if not model_readiness.ready:
+        raise HTTPException(
+            status_code=503, detail=f"Model server not ready: {model_readiness.status}"
+        )
+    try:
+        result = await get_orchestrator().retry_incomplete_run(run_id)
+    except KeyError:
+        # The HTTP status already says everything; hide the lookup traceback.
+        raise HTTPException(status_code=404, detail="Run not found") from None
+    except ValueError as exc:
+        raise HTTPException(status_code=409, detail=str(exc)) from exc
+
+    return RunCreateResponse(
+        run_id=result.run_id,
+        status="queued",
+        documents_total=len(result.documents),
+        pages_total_estimate=result.pages_total_estimate,
+        documents=[
+            StagedDocumentInfo(
+                document_id=d.document_id,
+                filename=d.filename,
+                file_sha256=d.file_sha256,
+                deduped=d.deduped,
+                estimated_pages=d.estimated_pages,
+            )
+            for d in result.documents
+        ],
+    )
+
+
 @router.get("/api/runs", response_model=list[RunSummary])
 async def list_runs(limit: int = Query(50, ge=1, le=500)):
     return [_run_summary(r) for r in await get_db().list_runs(limit=limit)]
diff --git a/ocr_pipeline/services/db.py b/ocr_pipeline/services/db.py
index d3337d6..0987a79 100644
--- a/ocr_pipeline/services/db.py
+++ b/ocr_pipeline/services/db.py
@@ -288,6 +288,12 @@ async def delete_run(self, run_id: str) -> None:
     async def fail_orphan_runs(self) -> int:
         """Mark any run still in `processing`/`queued` as failed. Called once
         on startup so a crashed process does not leave runs visibly live."""
+        async with self.conn.execute(
+            "SELECT id FROM runs WHERE status IN ('queued', 'processing')"
+        ) as cur:
+            run_ids = [row["id"] for row in await cur.fetchall()]
+        for run_id in run_ids:
+            await self.fail_incomplete_run_documents(run_id)
         cur = await self.conn.execute(
             """
             UPDATE runs
@@ -304,6 +310,44 @@ async def fail_orphan_runs(self) -> int:
         await cur.close()
         return affected
 
+    async def fail_incomplete_run_documents(self, run_id: str) -> None:
+        """Mark this run's not-yet-completed documents as failed.
+
+        ``completed_at`` is filled from ``_now()`` only where it is still NULL.
+        """
+        await self.conn.execute(
+            """
+            UPDATE run_documents
+               SET status = 'failed',
+                   completed_at = COALESCE(completed_at, ?)
+             WHERE run_id = ?
+               AND status != 'completed'
+            """,
+            (_now(), run_id),
+        )
+        await self.conn.commit()
+
+    async def fail_documents_for_failed_runs(self) -> int:
+        """Mark not-yet-completed documents of every failed run as failed.
+
+        Returns:
+            The number of rows changed, taken from the cursor's ``rowcount``.
+        """
+        cur = await self.conn.execute(
+            """
+            UPDATE run_documents
+               SET status = 'failed',
+                   completed_at = COALESCE(completed_at, ?)
+             WHERE status != 'completed'
+               AND run_id IN (SELECT id FROM runs WHERE status = 'failed')
+            """,
+            (_now(),),
+        )
+        await self.conn.commit()
+        affected = cur.rowcount or 0
+        await cur.close()
+        return affected
+
     # ---------- documents (content-addressed) ----------
 
     async def upsert_document(
diff --git a/ocr_pipeline/services/run_orchestrator.py b/ocr_pipeline/services/run_orchestrator.py
index 1c09f9d..91c10b2 100644
--- a/ocr_pipeline/services/run_orchestrator.py
+++ b/ocr_pipeline/services/run_orchestrator.py
@@ -150,6 +150,50 @@ def start(
         task.add_done_callback(self._tasks.discard)
         return task
 
+    async def retry_incomplete_run(self, run_id: str) -> CreateRunResult:
+        """Start a new run for the documents a failed run did not complete.
+
+        The new run is named ``"<name> retry"``, using the run id when the
+        old run has no name, and reuses the old run's ``strip_refs`` and
+        ``export_parquet`` flags.
+
+        Returns:
+            The ``CreateRunResult`` of the new run, which has been started.
+
+        Raises:
+            KeyError: If ``run_id`` is unknown.
+            ValueError: If the run is not ``failed``, or if none of its
+                documents is both incomplete and has a source path.
+        """
+        run = await self.db.get_run(run_id)
+        if not run:
+            raise KeyError(run_id)
+        if run["status"] != "failed":
+            raise ValueError("Only failed runs can be retried")
+
+        documents = await self.db.list_run_documents(run_id)
+        retry_paths = [
+            doc["document_source_path"]
+            for doc in documents
+            if doc["status"] != "completed" and doc.get("document_source_path")
+        ]
+        if not retry_paths:
+            raise ValueError("No incomplete documents to retry")
+
+        name = run.get("name") or run_id
+        result = await self.create_run(
+            retry_paths,
+            name=f"{name} retry",
+            strip_refs=bool(run.get("strip_refs")),
+            export_parquet=bool(run.get("export_parquet")),
+        )
+        self.start(
+            result,
+            strip_refs=bool(run.get("strip_refs")),
+            export_parquet=bool(run.get("export_parquet")),
+        )
+        return result
+
     async def _run(
         self,
         result: CreateRunResult,
@@ -238,6 +268,7 @@ async def page_event(event: dict) -> None:
             )
         except Exception as exc:
             logger.exception("Run %s failed", run_id)
+            await self.db.fail_incomplete_run_documents(run_id)
             await self.db.update_run(
                 run_id,
                 status="failed",
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index 303f5a8..7f77bcc 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -224,6 +224,10 @@ <h2>Run <code x-text="selectedRun?.id"></code></h2>
         <div class="stage-actions">
           <span class="pill" :class="runStatusClass(selectedRun?.status)" x-text="selectedRun?.status"></span>
           <button class="btn btn-ghost btn-sm" @click="refreshSelectedRun">Refresh</button>
+          <button class="btn btn-primary btn-sm"
+                  :disabled="creating"
+                  @click="retryRun"
+                  x-show="selectedRun?.status === 'failed'">Retry incomplete</button>
           <button class="btn btn-ghost btn-sm" @click="deleteSelectedRun" x-show="selectedRun?.status !== 'processing'">Delete</button>
         </div>
       </header>
diff --git a/ocr_pipeline/static/js/api.js b/ocr_pipeline/static/js/api.js
index 66cc5a2..2150f0f 100644
--- a/ocr_pipeline/static/js/api.js
+++ b/ocr_pipeline/static/js/api.js
@@ -115,6 +115,17 @@ const API = {
     return res.json();
   },
 
+  // POST /api/runs/{id}/retry. Resolves with the parsed JSON body; on a
+  // non-OK response throws an Error carrying the body's `detail`, if any.
+  async retryRun(runId) {
+    const res = await fetch(`/api/runs/${encodeURIComponent(runId)}/retry`, { method: 'POST' });
+    if (!res.ok) {
+      const data = await res.json().catch(() => ({}));
+      throw new Error(data.detail || 'Failed to retry run');
+    }
+    return res.json();
+  },
+
   async getRunDocument(runId, documentId) {
     const res = await fetch(
       `/api/runs/${encodeURIComponent(runId)}/documents/${encodeURIComponent(documentId)}`
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index c7c50f2..fb2c44d 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -444,6 +444,23 @@ function opencrApp() {
       }
     },
 
+    // Retry the selected run, then refresh the run list and select the new
+    // run. `creating` is held for the duration so the action cannot overlap.
+    async retryRun() {
+      if (!this.selectedRunId || this.creating) return;
+      this.creating = true;
+      try {
+        const result = await API.retryRun(this.selectedRunId);
+        this.toast(`Retry run ${result.run_id} queued`, 'success');
+        await this.refreshRuns();
+        await this.selectRun(result.run_id);
+      } catch (e) {
+        this.toast(`Retry failed: ${e.message}`, 'error');
+      } finally {
+        this.creating = false;
+      }
+    },
+
     toggleSelected(path) {
       const i = this.selectedPaths.indexOf(path);
       if (i === -1) this.selectedPaths.push(path); else this.selectedPaths.splice(i, 1);
diff --git a/tests/test_db_sweep.py b/tests/test_db_sweep.py
index ae970ca..812f2e8 100644
--- a/tests/test_db_sweep.py
+++ b/tests/test_db_sweep.py
@@ -10,13 +10,20 @@ async def _scenario():
         try:
             for run_id in ("run-a", "run-b", "run-c"):
                 await db.create_run(
-                    run_id, name=None, documents_total=1, pages_total_estimate=1,
-                    strip_refs=False, export_parquet=False,
-                    pipeline_version="2.0.0", model_used="m",
+                    run_id,
+                    name=None,
+                    documents_total=1,
+                    pages_total_estimate=1,
+                    strip_refs=False,
+                    export_parquet=False,
+                    pipeline_version="2.0.0",
+                    model_used="m",
                 )
             await db.update_run("run-a", status="processing", stage="ocr")
             # run-b stays queued
-            await db.update_run("run-c", status="completed", stage="completed", progress=1.0)
+            await db.update_run(
+                "run-c", status="completed", stage="completed", progress=1.0
+            )
 
             affected = await db.fail_orphan_runs()
             assert affected == 2
@@ -30,3 +37,88 @@ async def _scenario():
             await db.close()
 
     asyncio.run(_scenario())
+
+
+def test_fail_orphan_runs_marks_incomplete_documents_failed(tmp_path) -> None:
+    async def _scenario() -> None:
+        db = Database(tmp_path / "opencr.sqlite")
+        await db.connect()
+        try:
+            await db.create_run(
+                "run-a",
+                name=None,
+                documents_total=3,
+                pages_total_estimate=3,
+                strip_refs=False,
+                export_parquet=False,
+                pipeline_version="2.0.0",
+                model_used="m",
+            )
+            for doc_id, status in (
+                ("doc-done", "completed"),
+                ("doc-active", "processing"),
+                ("doc-waiting", "pending"),
+            ):
+                await db.upsert_document(
+                    doc_id,
+                    filename=f"{doc_id}.pdf",
+                    source_path=f"/tmp/{doc_id}.pdf",
+                    file_sha256=doc_id,
+                    file_size_bytes=1,
+                )
+                await db.link_run_document("run-a", doc_id, status=status)
+            await db.update_run("run-a", status="processing", stage="ocr")
+            # Sweep while the run is still marked as processing.
+            await db.fail_orphan_runs()
+            # Completed documents must be left alone; unfinished ones must fail.
+            docs = {
+                doc["document_id"]: doc for doc in await db.list_run_documents("run-a")
+            }
+            assert docs["doc-done"]["status"] == "completed"
+            assert docs["doc-active"]["status"] == "failed"
+            assert docs["doc-waiting"]["status"] == "failed"
+        finally:
+            await db.close()
+
+    asyncio.run(_scenario())
+
+
+def test_failed_runs_normalize_incomplete_documents(tmp_path) -> None:
+    async def _scenario() -> None:
+        db = Database(tmp_path / "opencr.sqlite")
+        await db.connect()
+        try:
+            await db.create_run(
+                "run-a",
+                name=None,
+                documents_total=2,
+                pages_total_estimate=2,
+                strip_refs=False,
+                export_parquet=False,
+                pipeline_version="2.0.0",
+                model_used="m",
+            )
+            for doc_id, status in (
+                ("doc-active", "processing"),
+                ("doc-waiting", "pending"),
+            ):
+                await db.upsert_document(
+                    doc_id,
+                    filename=f"{doc_id}.pdf",
+                    source_path=f"/tmp/{doc_id}.pdf",
+                    file_sha256=doc_id,
+                    file_size_bytes=1,
+                )
+                await db.link_run_document("run-a", doc_id, status=status)
+            await db.update_run("run-a", status="failed", stage="failed")
+            # The run is already failed, but both documents are still unfinished.
+            affected = await db.fail_documents_for_failed_runs()
+            # Both linked documents should be counted and end up failed.
+            assert affected == 2
+            assert {doc["status"] for doc in await db.list_run_documents("run-a")} == {
+                "failed"
+            }
+        finally:
+            await db.close()
+
+    asyncio.run(_scenario())
diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py
index aa11e99..06f3c1f 100644
--- a/tests/test_ui_routes.py
+++ b/tests/test_ui_routes.py
@@ -169,5 +169,7 @@ def test_home_uses_document_workbench():
     assert "downloadOCRPairs()" in app_js
     assert "selectedRunDocumentIds" in app_js
     assert "documentProcessLabel" in app_js
+    assert "retryRun()" in app_js
+    assert "Retry incomplete" in html
     assert "selectedPageText()" in app_js
     assert "saveSelectedDocument()" in app_js

From 2104d663a335d8ea9bffe1d3f198c555e2205cda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 21:59:35 +0300
Subject: [PATCH 07/15] feat: add dependency caching for local OCR engine,
 enhance observability logging, and improve startup configuration

---
 ocr_pipeline/Dockerfile                   |  2 +-
 ocr_pipeline/Dockerfile.cpu               |  2 +-
 ocr_pipeline/config.py                    |  1 +
 ocr_pipeline/main.py                      |  6 ++-
 ocr_pipeline/services/local_ocr_engine.py | 34 ++++++++-----
 ocr_pipeline/services/run_orchestrator.py | 59 +++++++++++++++++++++--
 requirements-local.txt                    |  1 +
 scripts/start.sh                          | 13 ++++-
 tests/test_local_ocr_engine.py            | 29 +++++++++++
 tests/test_requirements.py                |  8 +++
 10 files changed, 134 insertions(+), 21 deletions(-)
 create mode 100644 tests/test_local_ocr_engine.py
 create mode 100644 tests/test_requirements.py

diff --git a/ocr_pipeline/Dockerfile b/ocr_pipeline/Dockerfile
index 71e5ba0..53824e5 100644
--- a/ocr_pipeline/Dockerfile
+++ b/ocr_pipeline/Dockerfile
@@ -20,4 +20,4 @@ ENV PYTHONPATH=/app
 
 EXPOSE 39672
 
-CMD ["uvicorn", "ocr_pipeline.main:app", "--host", "0.0.0.0", "--port", "39672"]
+CMD ["uvicorn", "ocr_pipeline.main:app", "--host", "0.0.0.0", "--port", "39672", "--no-access-log"]
diff --git a/ocr_pipeline/Dockerfile.cpu b/ocr_pipeline/Dockerfile.cpu
index 7b1aedb..468104d 100644
--- a/ocr_pipeline/Dockerfile.cpu
+++ b/ocr_pipeline/Dockerfile.cpu
@@ -31,4 +31,4 @@ ENV LOCAL_DEVICE=cpu
 
 EXPOSE 39672
 
-CMD ["uvicorn", "ocr_pipeline.main:app", "--host", "0.0.0.0", "--port", "39672"]
+CMD ["uvicorn", "ocr_pipeline.main:app", "--host", "0.0.0.0", "--port", "39672", "--no-access-log"]
diff --git a/ocr_pipeline/config.py b/ocr_pipeline/config.py
index 4e4a019..017f52c 100644
--- a/ocr_pipeline/config.py
+++ b/ocr_pipeline/config.py
@@ -59,6 +59,7 @@ class Settings(BaseSettings):
     # Server
     host: str = "0.0.0.0"
     port: int = 39672
+    log_level: str = "INFO"
 
     # Pipeline
     pipeline_version: str = "2.0.0"
diff --git a/ocr_pipeline/main.py b/ocr_pipeline/main.py
index c2bbba7..dc13370 100644
--- a/ocr_pipeline/main.py
+++ b/ocr_pipeline/main.py
@@ -25,9 +25,11 @@
 from ocr_pipeline.services.startup import wait_for_model_server
 
 logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
+    level=getattr(logging, settings.log_level.upper(), logging.INFO),
+    format="%(asctime)s %(levelname)-7s %(name)s :: %(message)s",
 )
+for noisy_logger in ("httpx", "httpcore", "urllib3", "huggingface_hub"):
+    logging.getLogger(noisy_logger).setLevel(logging.WARNING)
 logger = logging.getLogger("ocr_pipeline")
 
 
diff --git a/ocr_pipeline/services/local_ocr_engine.py b/ocr_pipeline/services/local_ocr_engine.py
index b9816e9..a94acfe 100644
--- a/ocr_pipeline/services/local_ocr_engine.py
+++ b/ocr_pipeline/services/local_ocr_engine.py
@@ -97,29 +97,41 @@ def __init__(self, model_name: str | None = None) -> None:
         self._device: str | None = None
         self._dtype: Any = None
         self._lock = asyncio.Lock()
+        self._load_error: BaseException | None = None
         self._initialized = True
 
     async def _ensure_loaded(self) -> None:
         if self._model is not None:
             return
+        if self._load_error is not None:
+            raise RuntimeError("Local OCR engine failed to load") from self._load_error
         async with self._lock:
             if self._model is not None:
                 return
-            await asyncio.to_thread(self._load_blocking)
+            if self._load_error is not None:
+                raise RuntimeError(
+                    "Local OCR engine failed to load"
+                ) from self._load_error
+            try:
+                await asyncio.to_thread(self._load_blocking)
+            except Exception as exc:
+                self._load_error = exc
+                logger.error("Local OCR engine failed to load: %s", exc)
+                raise
 
     def _load_blocking(self) -> None:
-        if find_spec("torch") is None:
+        missing = [
+            package
+            for package in ("torch", "transformers", "easydict")
+            if find_spec(package) is None
+        ]
+        if missing:
             raise RuntimeError(
-                "MODEL_BACKEND=local requires `transformers` and `torch`. "
-                "Install with: pip install -r requirements-local.txt"
+                "MODEL_BACKEND=local missing package(s): "
+                f"{', '.join(missing)}. Install with: "
+                "pip install -r ocr_pipeline/requirements.txt -r requirements-local.txt"
             )
-        try:
-            from transformers import AutoModel, AutoTokenizer
-        except ImportError as exc:
-            raise RuntimeError(
-                "MODEL_BACKEND=local requires `transformers` and `torch`. "
-                "Install with: pip install -r requirements-local.txt"
-            ) from exc
+        from transformers import AutoModel, AutoTokenizer
 
         device = _resolve_device(settings.local_device)
         dtype = _resolve_dtype(settings.local_dtype, device)
diff --git a/ocr_pipeline/services/run_orchestrator.py b/ocr_pipeline/services/run_orchestrator.py
index 91c10b2..45fa6b4 100644
--- a/ocr_pipeline/services/run_orchestrator.py
+++ b/ocr_pipeline/services/run_orchestrator.py
@@ -133,6 +133,13 @@ async def create_run(
         for s in staged:
             await self.db.link_run_document(run_id, s.document_id, status="pending")
 
+        logger.info(
+            "run=%s queued docs=%d pages=%d name=%s",
+            run_id,
+            len(staged),
+            pages_total,
+            name or "-",
+        )
         observability.job_created()
         return CreateRunResult(
             run_id=run_id, documents=staged, pages_total_estimate=pages_total
@@ -167,6 +174,7 @@ async def retry_incomplete_run(self, run_id: str) -> CreateRunResult:
             raise ValueError("No incomplete documents to retry")
 
         name = run.get("name") or run_id
+        logger.info("run=%s retry queued incomplete_docs=%d", run_id, len(retry_paths))
         result = await self.create_run(
             retry_paths,
             name=f"{name} retry",
@@ -189,15 +197,20 @@ async def _run(
     ) -> None:
         run_id = result.run_id
         started_at = _now()
+        pages_total = result.pages_total_estimate
+        pages_completed = 0
+        documents_meta: list = []
         await self.db.update_run(
             run_id, status="processing", stage="ocr", started_at=started_at
         )
+        logger.info(
+            "run=%s started docs=%d pages=%d",
+            run_id,
+            len(result.documents),
+            pages_total,
+        )
         await self._emit(run_id, "run_started", {"started_at": started_at})
 
-        pages_total = result.pages_total_estimate
-        pages_completed = 0
-        documents_meta: list = []
-
         async def page_event(event: dict) -> None:
             nonlocal pages_completed
             etype = event.get("type")
@@ -214,15 +227,39 @@ async def page_event(event: dict) -> None:
                     token_count=event.get("token_count", 0),
                     validation_status=event.get("validation_status", "pass"),
                 )
+                logger.info(
+                    "run=%s page=%d/%d doc=%s status=%s time=%.1fms",
+                    run_id,
+                    pages_completed,
+                    pages_total,
+                    event.get("document"),
+                    event.get("validation_status"),
+                    event.get("processing_time_ms", 0.0),
+                )
             elif etype == "page_retry":
                 observability.page_retry()
+                logger.info(
+                    "run=%s page=%s retry attempt=%s strategy=%s reason=%s",
+                    run_id,
+                    event.get("page"),
+                    event.get("attempt"),
+                    event.get("new_strategy"),
+                    event.get("reason"),
+                )
             await self._emit(run_id, etype, event)
 
         try:
-            for staged in result.documents:
+            for index, staged in enumerate(result.documents, start=1):
                 paths = self.storage.artifact_paths(
                     run_id, staged.document_id, staged.filename
                 )
+                logger.info(
+                    "run=%s doc=%d/%d started %s",
+                    run_id,
+                    index,
+                    len(result.documents),
+                    staged.filename,
+                )
                 processor = BatchProcessor(
                     self.db, event_callback=page_event, strip_refs=strip_refs
                 )
@@ -238,6 +275,16 @@ async def page_event(event: dict) -> None:
                 await self.db.update_run(
                     run_id, documents_completed=len(documents_meta)
                 )
+                logger.info(
+                    "run=%s doc=%d/%d completed %s pass=%d warn=%d fail=%d",
+                    run_id,
+                    index,
+                    len(result.documents),
+                    staged.filename,
+                    doc_meta.pages_pass,
+                    doc_meta.pages_warn,
+                    doc_meta.pages_fail,
+                )
 
             dataset_bundle = await self._maybe_export(
                 run_id, documents_meta, export_parquet
@@ -253,6 +300,7 @@ async def page_event(event: dict) -> None:
                 dataset_bundle=dataset_bundle,
                 completed_at=completed_at,
             )
+            logger.info("run=%s completed bundle=%s", run_id, dataset_bundle or "-")
             observability.job_completed()
             await self._emit(
                 run_id,
@@ -285,6 +333,7 @@ async def _maybe_export(
         if not (export_parquet and documents_meta):
             return None
         await self.db.update_run(run_id, stage="exporting")
+        logger.info("run=%s exporting dataset docs=%d", run_id, len(documents_meta))
         await self._emit(run_id, "dataset_export_started", {})
         exports = []
         for did, paths, meta in documents_meta:
diff --git a/requirements-local.txt b/requirements-local.txt
index bf4bf1d..2a62e29 100644
--- a/requirements-local.txt
+++ b/requirements-local.txt
@@ -10,3 +10,4 @@ transformers>=4.46.0
 accelerate>=0.34.0
 einops>=0.8.0
 sentencepiece>=0.2.0
+easydict>=1.13
diff --git a/scripts/start.sh b/scripts/start.sh
index 314a9da..7bf3842 100755
--- a/scripts/start.sh
+++ b/scripts/start.sh
@@ -31,9 +31,20 @@ mkdir -p "$INPUT_DIR" "$OUTPUT_DIR"
 export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"
 PORT="${PORT:-39672}"
 HOST="${HOST:-0.0.0.0}"
+if [[ -z "${PYTHON_BIN:-}" && -x ".venv/bin/python" ]]; then
+  PYTHON_BIN=".venv/bin/python"
+else
+  PYTHON_BIN="${PYTHON_BIN:-python3}"
+fi
 
 echo "→ OpenCR  backend=$MODEL_BACKEND  http://$HOST:$PORT"
 echo "  input=$INPUT_DIR"
 echo "  output=$OUTPUT_DIR"
+echo "  python=$PYTHON_BIN"
+
+# UVICORN_ARGS is empty when ACCESS_LOG=1. Bash older than 4.4 treats an empty
+# "${arr[@]}" as unbound under `set -u`, so exec expands it via ${arr[@]+...}.
+UVICORN_ARGS=()
+if [[ "${ACCESS_LOG:-0}" != "1" ]]; then UVICORN_ARGS+=(--no-access-log); fi
 
-exec python3 -m uvicorn ocr_pipeline.main:app --host "$HOST" --port "$PORT" "$@"
+exec "$PYTHON_BIN" -m uvicorn ocr_pipeline.main:app --host "$HOST" --port "$PORT" ${UVICORN_ARGS[@]+"${UVICORN_ARGS[@]}"} "$@"
diff --git a/tests/test_local_ocr_engine.py b/tests/test_local_ocr_engine.py
new file mode 100644
index 0000000..4c65c57
--- /dev/null
+++ b/tests/test_local_ocr_engine.py
@@ -0,0 +1,29 @@
+import asyncio
+
+from ocr_pipeline.services.local_ocr_engine import LocalOCREngine
+
+
+def test_local_engine_caches_load_failure(monkeypatch):
+    async def _scenario():
+        LocalOCREngine._instance = None
+        engine = LocalOCREngine()
+        calls = 0
+
+        def fail_load():
+            nonlocal calls
+            calls += 1
+            raise RuntimeError("missing dependency")
+
+        monkeypatch.setattr(engine, "_load_blocking", fail_load)
+
+        for _ in range(2):
+            try:
+                await engine._ensure_loaded()
+            except RuntimeError:
+                pass
+            else:
+                raise AssertionError("expected load failure")
+
+        assert calls == 1
+
+    asyncio.run(_scenario())
diff --git a/tests/test_requirements.py b/tests/test_requirements.py
new file mode 100644
index 0000000..6807291
--- /dev/null
+++ b/tests/test_requirements.py
@@ -0,0 +1,8 @@
+from pathlib import Path
+
+
+def test_local_requirements_include_deepseek_remote_code_dependencies():
+    requirements = (Path(__file__).parents[1] / "requirements-local.txt").read_text(
+        encoding="utf-8"
+    )
+    assert "easydict" in requirements

From c4dbb91b4fa42a807d0cf09a9a682facbf790696 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 22:03:12 +0300
Subject: [PATCH 08/15] chore: update dependency versions and add addict to
 local requirements and engine checks

---
 ocr_pipeline/services/local_ocr_engine.py | 2 +-
 requirements-local.txt                    | 4 +++-
 tests/test_requirements.py                | 3 +++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/ocr_pipeline/services/local_ocr_engine.py b/ocr_pipeline/services/local_ocr_engine.py
index a94acfe..2c73c4b 100644
--- a/ocr_pipeline/services/local_ocr_engine.py
+++ b/ocr_pipeline/services/local_ocr_engine.py
@@ -122,7 +122,7 @@ async def _ensure_loaded(self) -> None:
     def _load_blocking(self) -> None:
         missing = [
             package
-            for package in ("torch", "transformers", "easydict")
+            for package in ("torch", "transformers", "tokenizers", "addict", "easydict")
             if find_spec(package) is None
         ]
         if missing:
diff --git a/requirements-local.txt b/requirements-local.txt
index 2a62e29..738ece4 100644
--- a/requirements-local.txt
+++ b/requirements-local.txt
@@ -6,8 +6,10 @@
 # CUDA-matched wheels from https://pytorch.org/ instead of letting pip pick.
 torch>=2.4.0
 torchvision>=0.19.0
-transformers>=4.46.0
+transformers==4.46.3
+tokenizers==0.20.3
 accelerate>=0.34.0
 einops>=0.8.0
 sentencepiece>=0.2.0
+addict>=2.4.0
 easydict>=1.13
diff --git a/tests/test_requirements.py b/tests/test_requirements.py
index 6807291..707b8fa 100644
--- a/tests/test_requirements.py
+++ b/tests/test_requirements.py
@@ -5,4 +5,7 @@ def test_local_requirements_include_deepseek_remote_code_dependencies():
     requirements = (Path(__file__).parents[1] / "requirements-local.txt").read_text(
         encoding="utf-8"
     )
+    assert "transformers==4.46.3" in requirements
+    assert "tokenizers==0.20.3" in requirements
+    assert "addict" in requirements
     assert "easydict" in requirements

From a3489d1464df289fa9954e86d2c293acf4108231 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 22:16:17 +0300
Subject: [PATCH 09/15] feat: add local model cache verification and expose
 status in API and UI

---
 ocr_pipeline/models/schemas.py   |  2 +
 ocr_pipeline/routers/health.py   |  4 +-
 ocr_pipeline/services/startup.py | 71 +++++++++++++++++++++++++++-----
 ocr_pipeline/static/js/app.js    |  6 ++-
 tests/test_startup.py            | 33 +++++++++++++++
 5 files changed, 102 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_startup.py

diff --git a/ocr_pipeline/models/schemas.py b/ocr_pipeline/models/schemas.py
index bdf0e79..0f42978 100644
--- a/ocr_pipeline/models/schemas.py
+++ b/ocr_pipeline/models/schemas.py
@@ -74,6 +74,8 @@ class HealthResponse(BaseModel):
     model_status: str
     input_dir: str = ""
     output_dir: str = ""
+    local_model_cached: Optional[bool] = None
+    local_model_cache_dir: Optional[str] = None
 
 
 class FileInfo(BaseModel):
diff --git a/ocr_pipeline/routers/health.py b/ocr_pipeline/routers/health.py
index 13878ad..dd6a8d5 100644
--- a/ocr_pipeline/routers/health.py
+++ b/ocr_pipeline/routers/health.py
@@ -12,7 +12,7 @@
 async def health_check():
 
     status = "ready" if model_readiness.ready else "waiting"
-    
+
     resp = HealthResponse(
         status=status,
         pipeline_version=settings.pipeline_version,
@@ -21,6 +21,8 @@ async def health_check():
         model_status=model_readiness.status,
         input_dir=str(settings.input_dir),
         output_dir=str(settings.output_dir),
+        local_model_cached=model_readiness.local_model_cached,
+        local_model_cache_dir=model_readiness.local_model_cache_dir,
     )
     if not model_readiness.ready:
         return JSONResponse(content=resp.model_dump(), status_code=503)
diff --git a/ocr_pipeline/services/startup.py b/ocr_pipeline/services/startup.py
index cc42c34..c68fa75 100644
--- a/ocr_pipeline/services/startup.py
+++ b/ocr_pipeline/services/startup.py
@@ -3,6 +3,7 @@
 import time
 
 import httpx
+from huggingface_hub import try_to_load_from_cache
 
 from ocr_pipeline.config import settings
 
@@ -17,10 +18,15 @@ def __init__(self):
         self.model_name: str | None = None
         self.error: str | None = None
         self.checked_at: float = 0
+        self.local_model_cached: bool | None = None
+        self.local_model_cache_dir: str | None = None
+        self.note: str | None = None
 
     @property
     def status(self) -> str:
         if self.ready:
+            if self.note:
+                return self.note
             return "ready"
         if self.error:
             return f"waiting ({self.error})"
@@ -30,6 +36,52 @@ def status(self) -> str:
 model_readiness = ModelReadiness()
 
 
+def _local_model_cache_files_present() -> bool:
+    required_files = (
+        "config.json",
+        "tokenizer_config.json",
+        "tokenizer.json",
+        "model.safetensors.index.json",
+    )
+    return all(
+        try_to_load_from_cache(
+            settings.model_name,
+            filename,
+            cache_dir=settings.local_model_cache,
+        )
+        for filename in required_files
+    )
+
+
+async def configure_local_readiness(
+    readiness: ModelReadiness = model_readiness,
+) -> bool:
+    cached = await asyncio.to_thread(_local_model_cache_files_present)
+    readiness.ready = True
+    readiness.model_name = settings.model_name
+    readiness.error = None
+    readiness.checked_at = time.time()
+    readiness.local_model_cached = cached
+    readiness.local_model_cache_dir = str(settings.local_model_cache)
+    readiness.note = (
+        "ready (local model cached)"
+        if cached
+        else ("ready (local model will download on first extraction)")
+    )
+    if cached:
+        logger.info(
+            "Local backend selected; model cache found at %s.",
+            settings.local_model_cache,
+        )
+    else:
+        logger.warning(
+            "Local backend selected; model is not fully cached at %s. "
+            "First extraction will download model files.",
+            settings.local_model_cache,
+        )
+    return True
+
+
 async def wait_for_model_server() -> bool:
     """
     Block until the model server is healthy and can list its model.
@@ -39,21 +91,14 @@ async def wait_for_model_server() -> bool:
     loads lazily on the first request — so we mark ready immediately.
     """
     if settings.is_local_backend:
-        model_readiness.ready = True
-        model_readiness.model_name = settings.model_name
-        model_readiness.error = None
-        model_readiness.checked_at = time.time()
-        logger.info("Local backend selected; model will load on first request.")
-        return True
+        return await configure_local_readiness()
 
     base = settings.model_server_url
     timeout = settings.model_ready_timeout
     interval = settings.model_ready_interval
     deadline = time.monotonic() + timeout
 
-    logger.info(
-        "Waiting for model server at %s (timeout %ds)...", base, timeout
-    )
+    logger.info("Waiting for model server at %s (timeout %ds)...", base, timeout)
 
     async with httpx.AsyncClient(timeout=10) as client:
         while time.monotonic() < deadline:
@@ -61,12 +106,16 @@ async def wait_for_model_server() -> bool:
                 resp = await client.get(f"{base}/health")
                 if resp.status_code != 200:
                     model_readiness.error = f"health returned {resp.status_code}"
-                    logger.info("Model server not healthy yet (%s)", model_readiness.error)
+                    logger.info(
+                        "Model server not healthy yet (%s)", model_readiness.error
+                    )
                     await asyncio.sleep(interval)
                     continue
             except (httpx.ConnectError, httpx.ReadTimeout, httpx.ConnectTimeout) as exc:
                 model_readiness.error = f"connection failed ({type(exc).__name__})"
-                logger.info("Model server not reachable yet (%s)", model_readiness.error)
+                logger.info(
+                    "Model server not reachable yet (%s)", model_readiness.error
+                )
                 await asyncio.sleep(interval)
                 continue
 
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index fb2c44d..c94e396 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -95,8 +95,10 @@ function opencrApp() {
       try {
         const data = await API.health();
         this.version = data.pipeline_version || '';
-        this.healthStatus = data.status;
-        this.healthClass = data.status === 'ready' ? 'ready' : 'waiting';
+        this.healthStatus = data.model_status || data.status;
+        this.healthClass = data.status === 'ready'
+          ? (data.local_model_cached === false ? 'waiting' : 'ready')
+          : 'waiting';
       } catch {
         this.healthStatus = 'offline';
         this.healthClass = 'error';
diff --git a/tests/test_startup.py b/tests/test_startup.py
new file mode 100644
index 0000000..e55d0f7
--- /dev/null
+++ b/tests/test_startup.py
@@ -0,0 +1,33 @@
+import asyncio
+
+from ocr_pipeline.services.startup import ModelReadiness, configure_local_readiness
+
+
+def test_local_readiness_reports_cached_model(monkeypatch, tmp_path) -> None:
+    readiness = ModelReadiness()
+    # Stub the cache lookup so every requested file resolves to a path string.
+    monkeypatch.setattr(
+        "ocr_pipeline.services.startup.try_to_load_from_cache",
+        lambda _repo_id, filename, **_kwargs: str(tmp_path / filename),
+    )
+
+    asyncio.run(configure_local_readiness(readiness))
+
+    assert readiness.ready is True
+    assert readiness.local_model_cached is True
+    assert "cached" in readiness.status
+
+
+def test_local_readiness_reports_download_needed(monkeypatch) -> None:
+    readiness = ModelReadiness()
+    # Stub the cache lookup to return None for every requested file.
+    monkeypatch.setattr(
+        "ocr_pipeline.services.startup.try_to_load_from_cache",
+        lambda *_args, **_kwargs: None,
+    )
+
+    asyncio.run(configure_local_readiness(readiness))
+
+    assert readiness.ready is True
+    assert readiness.local_model_cached is False
+    assert "will download on first extraction" in readiness.status

From a555143016d07b94a57bc6520111af9ddb1fdc16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@pm.me>
Date: Mon, 11 May 2026 22:23:19 +0300
Subject: [PATCH 10/15] fix: opencr no longer assumes cuda means flash
 attention

---
 README.md                                 |  1 +
 ocr_pipeline/config.py                    |  3 +++
 ocr_pipeline/services/local_ocr_engine.py | 22 +++++++++++++---
 tests/test_local_ocr_engine.py            | 32 ++++++++++++++++++++++-
 4 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index cc85549..ff3bc52 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,7 @@ Configurable via environment variables (or a `.env` file):
 | `MODEL_NAME`         | `deepseek-ai/DeepSeek-OCR`       | Model identifier.                                                                                 |
 | `MODEL_API_KEY`      | `EMPTY`                          | API key for remote endpoints.                                                                     |
 | `LOCAL_DEVICE`       | auto                             | `auto`, `mps`, `cuda`, or `cpu` for the `local` backend.                                          |
+| `LOCAL_ATTN_IMPLEMENTATION` | auto                      | `auto`, `eager`, `sdpa`, or `flash_attention_2`. Auto uses FlashAttention only when `flash_attn` is installed. |
 | `INPUT_DIR`          | `./input` (or `/data/input`)     | Where to read PDFs from.                                                                          |
 | `OUTPUT_DIR`         | `./output` (or `/data/output`)   | Where artifacts and the SQLite DB land.                                                           |
 | `HOST` / `PORT`      | `0.0.0.0` / `39672`              | Where the web console serves.                                                                     |
diff --git a/ocr_pipeline/config.py b/ocr_pipeline/config.py
index 017f52c..d2b13b2 100644
--- a/ocr_pipeline/config.py
+++ b/ocr_pipeline/config.py
@@ -30,6 +30,9 @@ class Settings(BaseSettings):
     # Local backend (Apple Silicon / CPU)
     local_device: Literal["auto", "mps", "cuda", "cpu"] = "auto"
     local_dtype: Literal["auto", "float16", "bfloat16", "float32"] = "auto"
+    local_attn_implementation: Literal["auto", "eager", "sdpa", "flash_attention_2"] = (
+        "auto"
+    )
     local_model_cache: Path = Path.home() / ".cache" / "huggingface"
 
     # Startup readiness (used by the remote backend)
diff --git a/ocr_pipeline/services/local_ocr_engine.py b/ocr_pipeline/services/local_ocr_engine.py
index 2c73c4b..426700c 100644
--- a/ocr_pipeline/services/local_ocr_engine.py
+++ b/ocr_pipeline/services/local_ocr_engine.py
@@ -73,6 +73,21 @@ def _resolve_dtype(requested: str, device: str):
     return torch.float32
 
 
+def _resolve_attn_implementation(requested: str, device: str) -> str:
+    if requested != "auto":
+        if requested == "flash_attention_2" and find_spec("flash_attn") is None:
+            raise RuntimeError(
+                "LOCAL_ATTN_IMPLEMENTATION=flash_attention_2 requires `flash_attn`. "
+                "Install flash-attn, or unset LOCAL_ATTN_IMPLEMENTATION to use eager "
+                "attention."
+            )
+        return requested
+
+    if device == "cuda" and find_spec("flash_attn") is not None:
+        return "flash_attention_2"
+    return "eager"
+
+
 class LocalOCREngine:
     """In-process DeepSeek-OCR inference via `transformers`.
 
@@ -142,9 +157,10 @@ def _load_blocking(self) -> None:
             dtype,
         )
 
-        # eager attention works everywhere; flash-attn-2 is CUDA-only and would
-        # break MPS/CPU loads.
-        attn_impl = "flash_attention_2" if device == "cuda" else "eager"
+        attn_impl = _resolve_attn_implementation(
+            settings.local_attn_implementation, device
+        )
+        logger.info("Using %s attention implementation.", attn_impl)
 
         tokenizer = AutoTokenizer.from_pretrained(
             self.model_name,
diff --git a/tests/test_local_ocr_engine.py b/tests/test_local_ocr_engine.py
index 4c65c57..a69c616 100644
--- a/tests/test_local_ocr_engine.py
+++ b/tests/test_local_ocr_engine.py
@@ -1,6 +1,11 @@
 import asyncio
 
-from ocr_pipeline.services.local_ocr_engine import LocalOCREngine
+import pytest
+
+from ocr_pipeline.services.local_ocr_engine import (
+    LocalOCREngine,
+    _resolve_attn_implementation,
+)
 
 
 def test_local_engine_caches_load_failure(monkeypatch):
@@ -27,3 +32,28 @@ def fail_load():
         assert calls == 1
 
     asyncio.run(_scenario())
+
+
+def test_local_attn_auto_uses_eager_when_flash_attn_missing(monkeypatch):
+    monkeypatch.setattr(
+        "ocr_pipeline.services.local_ocr_engine.find_spec", lambda _name: None
+    )
+
+    assert _resolve_attn_implementation("auto", "cuda") == "eager"
+
+
+def test_local_attn_auto_uses_flash_when_available_on_cuda(monkeypatch):
+    monkeypatch.setattr(
+        "ocr_pipeline.services.local_ocr_engine.find_spec", lambda _name: object()
+    )
+
+    assert _resolve_attn_implementation("auto", "cuda") == "flash_attention_2"
+
+
+def test_local_attn_forced_flash_requires_flash_attn(monkeypatch):
+    monkeypatch.setattr(
+        "ocr_pipeline.services.local_ocr_engine.find_spec", lambda _name: None
+    )
+
+    with pytest.raises(RuntimeError, match="requires `flash_attn`"):
+        _resolve_attn_implementation("flash_attention_2", "cuda")

From 8985ee8fd2abc7edb5d25df14a0c16696095db4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 22:43:18 +0300
Subject: [PATCH 11/15] feat: add grounding box removal to text cleaner and
 suppress verbose model stdout during local inference

---
 ocr_pipeline/services/local_ocr_engine.py | 50 +++++++++++++----------
 ocr_pipeline/services/text_cleaner.py     | 21 ++++++----
 tests/test_local_ocr_engine.py            | 40 ++++++++++++++++++
 tests/test_text_cleaner.py                | 24 +++++++++++
 4 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/ocr_pipeline/services/local_ocr_engine.py b/ocr_pipeline/services/local_ocr_engine.py
index 426700c..c897dee 100644
--- a/ocr_pipeline/services/local_ocr_engine.py
+++ b/ocr_pipeline/services/local_ocr_engine.py
@@ -18,8 +18,10 @@
 from __future__ import annotations
 
 import asyncio
+import io
 import logging
 import tempfile
+from contextlib import redirect_stdout
 from importlib.util import find_spec
 from pathlib import Path
 from typing import Any
@@ -206,27 +208,33 @@ def _infer_blocking(self, image: Image.Image, prompt: str) -> str:
             image_path = tmp / "page.png"
             image.save(image_path, format="PNG")
 
-            try:
-                result = self._model.infer(
-                    self._tokenizer,
-                    prompt=prompt,
-                    image_file=str(image_path),
-                    output_path=str(tmp),
-                    base_size=1024,
-                    image_size=640,
-                    crop_mode=True,
-                    save_results=False,
-                    test_compress=False,
-                )
-            except TypeError:
-                # Older variants of the remote-code helper had a slightly
-                # different signature; fall back to the minimal kwargs.
-                result = self._model.infer(
-                    self._tokenizer,
-                    prompt=prompt,
-                    image_file=str(image_path),
-                    output_path=str(tmp),
-                )
+            remote_stdout = io.StringIO()
+            with redirect_stdout(remote_stdout):
+                try:
+                    result = self._model.infer(
+                        self._tokenizer,
+                        prompt=prompt,
+                        image_file=str(image_path),
+                        output_path=str(tmp),
+                        base_size=1024,
+                        image_size=640,
+                        crop_mode=True,
+                        save_results=False,
+                        test_compress=False,
+                        eval_mode=True,
+                    )
+                except TypeError:
+                    # Older variants of the remote-code helper had a slightly
+                    # different signature; fall back to the minimal kwargs.
+                    result = self._model.infer(
+                        self._tokenizer,
+                        prompt=prompt,
+                        image_file=str(image_path),
+                        output_path=str(tmp),
+                    )
+
+            if remote_stdout.getvalue():
+                logger.debug("Suppressed verbose model stdout during local inference.")
 
             if isinstance(result, str):
                 return result
diff --git a/ocr_pipeline/services/text_cleaner.py b/ocr_pipeline/services/text_cleaner.py
index f4056c1..0636323 100644
--- a/ocr_pipeline/services/text_cleaner.py
+++ b/ocr_pipeline/services/text_cleaner.py
@@ -24,13 +24,20 @@ class TextCleaner:
     ]
 
     ARTIFACT_PATTERNS = [
-        re.compile(r"<\|[a-z_]+\|>"),   # Any remaining special tokens
-        re.compile(r"\x00"),            # Null bytes
+        re.compile(r"<\|/?[a-z_]+\|>"),  # Any remaining special tokens
+        re.compile(r"\x00"),             # Null bytes
     ]
 
-    # <|ref|>text<|/ref|>[[x, y, w, h]] — model reference blocks with bounding boxes
+    # <|ref|>text<|/ref|><|det|>[[x, y, w, h]]<|/det|> — grounding boxes
+    # are useful for debugging, but should not leak into clean corpus text.
+    _REF_DET_BLOCK_RE = re.compile(
+        r"<\|ref\|>.*?<\|/ref\|>\s*<\|det\|>\s*\[\[.*?\]\]\s*<\|/det\|>\s*",
+        re.DOTALL,
+    )
+
+    # Older/simple reference block shape without explicit det tags.
     _REF_BLOCK_RE = re.compile(
-        r"<\|ref\|>(.*?)<\|/ref\|>\[\[[\d\s,]+\]\]",
+        r"<\|ref\|>(.*?)<\|/ref\|>\s*\[\[[\d\s,]+\]\]",
         re.DOTALL,
     )
 
@@ -43,8 +50,7 @@ def clean(self, text: str, strip_refs: bool = False) -> str:
             return ""
 
         text = self._normalize_unicode(text)
-        if strip_refs:
-            text = self._strip_ref_blocks(text)
+        text = self._strip_ref_blocks(text)
         text = self._strip_model_tokens(text)
         text = self._strip_artifacts(text)
         text = self._rejoin_hyphens(text)
@@ -65,7 +71,8 @@ def clean_fidelity(self, text: str, strip_refs: bool = False) -> str:
         return text.strip()
 
     def _strip_ref_blocks(self, text: str) -> str:
-        """Remove <|ref|>...<|/ref|>[[bbox]] blocks, keeping the inner text."""
+        """Remove grounding boxes while preserving older inline ref text."""
+        text = self._REF_DET_BLOCK_RE.sub("", text)
         return self._REF_BLOCK_RE.sub(r"\1", text)
 
     def _rejoin_hyphens(self, text: str) -> str:
diff --git a/tests/test_local_ocr_engine.py b/tests/test_local_ocr_engine.py
index a69c616..2e3964f 100644
--- a/tests/test_local_ocr_engine.py
+++ b/tests/test_local_ocr_engine.py
@@ -1,6 +1,7 @@
 import asyncio
 
 import pytest
+from PIL import Image
 
 from ocr_pipeline.services.local_ocr_engine import (
     LocalOCREngine,
@@ -57,3 +58,42 @@ def test_local_attn_forced_flash_requires_flash_attn(monkeypatch):
 
     with pytest.raises(RuntimeError, match="requires `flash_attn`"):
         _resolve_attn_implementation("flash_attention_2", "cuda")
+
+
+def test_local_infer_uses_eval_mode_so_text_is_returned(monkeypatch):
+    LocalOCREngine._instance = None
+    engine = LocalOCREngine()
+    engine._tokenizer = object()
+
+    calls = {}
+
+    class FakeModel:
+        def infer(self, tokenizer, **kwargs):
+            calls.update(kwargs)
+            return "recognized text"
+
+    engine._model = FakeModel()
+
+    result = engine._infer_blocking(Image.new("RGB", (8, 8)), "<image>\nFree OCR.")
+
+    assert result == "recognized text"
+    assert calls["eval_mode"] is True
+    assert calls["save_results"] is False
+
+
+def test_local_infer_suppresses_remote_model_stdout(capsys):
+    LocalOCREngine._instance = None
+    engine = LocalOCREngine()
+    engine._tokenizer = object()
+
+    class FakeModel:
+        def infer(self, tokenizer, **kwargs):
+            print("remote model debug noise")
+            return "recognized text"
+
+    engine._model = FakeModel()
+
+    result = engine._infer_blocking(Image.new("RGB", (8, 8)), "<image>\nFree OCR.")
+
+    assert result == "recognized text"
+    assert "remote model debug noise" not in capsys.readouterr().out
diff --git a/tests/test_text_cleaner.py b/tests/test_text_cleaner.py
index 0179c5e..aa2a3af 100644
--- a/tests/test_text_cleaner.py
+++ b/tests/test_text_cleaner.py
@@ -44,6 +44,22 @@ def test_strips_grounding_tokens(self, cleaner):
         result = cleaner.clean(text)
         assert "<|grounding|>" not in result
 
+    def test_clean_removes_grounding_boxes_without_dropping_text(self, cleaner):
+        text = (
+            "<|ref|>text<|/ref|><|det|>[[161, 580, 667, 653]]<|/det|>\n"
+            "(Yirmidokuzuncu madde) Saltanat-ı seniyenin asakir-i müs- "
+            "tahfaza ikamesi hukuku."
+        )
+
+        result = cleaner.clean(text)
+
+        assert result == (
+            "(Yirmidokuzuncu madde) Saltanat-ı seniyenin asakir-i müs- "
+            "tahfaza ikamesi hukuku."
+        )
+        assert "<|ref|>" not in result
+        assert "[[161, 580, 667, 653]]" not in result
+
 
 class TestArtifactRemoval:
     def test_strips_remaining_special_tokens(self, cleaner):
@@ -75,6 +91,14 @@ def test_fidelity_clean_preserves_page_spacing(self, cleaner):
         result = cleaner.clean_fidelity(text)
         assert result == "Line 1\n\n\nLine 2"
 
+    def test_fidelity_clean_can_strip_grounding_boxes(self, cleaner):
+        text = (
+            "<|ref|>text<|/ref|><|det|>[[161, 580, 667, 653]]<|/det|>\n"
+            "Visible line"
+        )
+        result = cleaner.clean_fidelity(text, strip_refs=True)
+        assert result == "Visible line"
+
 
 class TestOCRFixes:
     def test_curly_quotes_replaced(self, cleaner):

From 93f4303a7465382f4351e644b870e76a75ab0318 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 22:43:23 +0300
Subject: [PATCH 12/15] feat: implement TextBundleExporter to support
 downloading OCR text exports for documents

---
 ocr_pipeline/routers/runs.py                  |  63 +++++
 ocr_pipeline/services/text_bundle_exporter.py | 250 ++++++++++++++++++
 ocr_pipeline/static/index.html                |   3 +
 ocr_pipeline/static/js/api.js                 |   7 +
 ocr_pipeline/static/js/app.js                 |   9 +
 tests/test_text_bundle_exporter.py            | 155 +++++++++++
 tests/test_ui_routes.py                       |   2 +
 7 files changed, 489 insertions(+)
 create mode 100644 ocr_pipeline/services/text_bundle_exporter.py
 create mode 100644 tests/test_text_bundle_exporter.py

diff --git a/ocr_pipeline/routers/runs.py b/ocr_pipeline/routers/runs.py
index b493a53..2d90436 100644
--- a/ocr_pipeline/routers/runs.py
+++ b/ocr_pipeline/routers/runs.py
@@ -36,6 +36,7 @@
 from ocr_pipeline.services.pdf_renderer import PDFRenderer
 from ocr_pipeline.services.run_orchestrator import get_orchestrator
 from ocr_pipeline.services.startup import model_readiness
+from ocr_pipeline.services.text_bundle_exporter import TextBundleExporter
 
 
 router = APIRouter()
@@ -404,6 +405,68 @@ async def download_ocr_pairs(
     )
 
 
+@router.get("/api/runs/{run_id}/text-bundle/download")
+async def download_text_bundle(
+    run_id: str = ID,
+    document_ids: str | None = Query(None),
+):
+    """Download a completed run's text bundle zip, optionally for a document subset."""
+    # 409 while the run is unfinished; 404 for unknown ids or no completed text.
+    db = get_db()
+    run = await _require_run(run_id)
+    if run["status"] != "completed":
+        raise HTTPException(status_code=409, detail="Run is not yet completed")
+
+    documents = await db.list_run_documents(run_id)
+    selected_ids = {
+        part.strip() for part in (document_ids or "").split(",") if part.strip()
+    } or None
+    if selected_ids:
+        missing = selected_ids - {doc["document_id"] for doc in documents}
+        if missing:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Document not found in run: {sorted(missing)[0]}",
+            )
+
+    # Cache key: "all", or a short digest of the sorted selection.
+    scope = "all"
+    if selected_ids:
+        joined_ids = ",".join(sorted(selected_ids))
+        scope = hashlib.sha256(joined_ids.encode("utf-8")).hexdigest()[:12]
+    export_dir = settings.runs_dir / run_id / "dataset" / f"text_bundle_{scope}"
+    cached_bundle = export_dir.with_suffix(".zip")
+    download_name = f"{run_id}-text-bundle.zip"
+    # Check the cache before the per-document DB reads so a hit stays cheap.
+    if cached_bundle.exists():
+        return FileResponse(
+            cached_bundle, media_type="application/zip", filename=download_name
+        )
+
+    pages_by_document = {}
+    catalog_by_document = {}
+    for doc in documents:
+        doc_id = doc["document_id"]
+        pages_by_document[doc_id] = await db.list_pages(run_id, doc_id)
+        catalog_by_document[doc_id] = await db.get_document(doc_id) or {}
+
+    result = await asyncio.to_thread(
+        TextBundleExporter(export_dir).export_run,
+        run=run,
+        documents=documents,
+        pages_by_document=pages_by_document,
+        catalog_by_document=catalog_by_document,
+        document_ids=selected_ids,
+    )
+    if result.pages_count == 0:
+        # Drop the empty zip so the next request is not served it from cache.
+        result.bundle.unlink(missing_ok=True)
+        raise HTTPException(status_code=404, detail="No completed text to export")
+    return FileResponse(
+        result.bundle, media_type="application/zip", filename=download_name
+    )
+
+
 @router.get("/api/runs/{run_id}/stream")
 async def stream_run(run_id: str = ID, after_event_id: int = 0):
     await _require_run(run_id)
diff --git a/ocr_pipeline/services/text_bundle_exporter.py b/ocr_pipeline/services/text_bundle_exporter.py
new file mode 100644
index 0000000..e2fadd8
--- /dev/null
+++ b/ocr_pipeline/services/text_bundle_exporter.py
@@ -0,0 +1,250 @@
+import hashlib
+import json
+import re
+import shutil
+import tempfile
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+
+from ocr_pipeline.config import settings
+from ocr_pipeline.services.dataset_exporter import PROJECT_METADATA
+from ocr_pipeline.services.output_writer import PAGE_BREAK
+
+
+@dataclass(frozen=True)
+class TextBundleExportResult:
+    export_dir: Path  # directory holding the exported text and JSONL files
+    bundle: Path  # zip archive written next to `export_dir`
+    documents_count: int  # number of rows written to documents.jsonl
+    pages_count: int  # number of rows written to pages.jsonl
+
+
+class TextBundleExporter:
+    """Builds plain-text exports for corpus and NLP work."""
+
+    def __init__(self, export_dir: Path) -> None:
+        self.export_dir = export_dir  # final export directory; the zip lands beside it
+
+    @staticmethod
+    def _read_text(path_str: str | None) -> str:
+        """Read UTF-8 text from `path_str`; "" when unset or not a regular file."""
+        path = Path(path_str) if path_str else None
+        # is_file() rather than exists(): a directory path must not raise on read.
+        return path.read_text(encoding="utf-8") if path and path.is_file() else ""
+
+    @staticmethod
+    def _split_pages(text: str, total_pages: int) -> list[str]:
+        """Split on PAGE_BREAK, then pad with "" or truncate to `total_pages` items."""
+        chunks = text.split(PAGE_BREAK) if text else [""]
+        padding = [""] * max(total_pages - len(chunks), 0)
+        return (chunks + padding)[:total_pages]
+
+    @staticmethod
+    def _json_list(raw) -> list[str]:
+        """Coerce a list, JSON-array string, or comma-separated string to list[str]."""
+        if isinstance(raw, list) or not raw:
+            return [str(entry) for entry in raw or []]
+        try:
+            decoded = json.loads(raw)
+        except (TypeError, json.JSONDecodeError):
+            # Not JSON: treat it as a comma-separated string.
+            return [piece.strip() for piece in str(raw).split(",") if piece.strip()]
+        return [str(entry) for entry in decoded] if isinstance(decoded, list) else []
+
+    @staticmethod
+    def _language_list(value) -> list[str]:
+        """Normalise a list or comma-separated string into trimmed, non-empty entries."""
+        if not value:
+            return []
+        parts = value if isinstance(value, list) else str(value).split(",")
+        return [text for text in (str(part).strip() for part in parts) if text]
+
+    @staticmethod
+    def _text_sha256(text: str) -> str:
+        return hashlib.sha256(text.encode("utf-8")).hexdigest()  # hex of UTF-8 bytes
+
+    @staticmethod
+    def _file_stem(filename: str, document_id: str) -> str:
+        stem = Path(filename or document_id).stem or document_id  # no dir, no suffix
+        safe = re.sub(r"[^A-Za-z0-9._-]+", "_", stem).strip("._-")  # ASCII-only slug
+        return f"{safe or 'document'}__{document_id[:8]}"  # suffix: first 8 id chars
+
+    def export_run(
+        self,
+        *,
+        run: dict,
+        documents: list[dict],
+        pages_by_document: dict[str, list[dict]],
+        catalog_by_document: dict[str, dict],
+        document_ids: set[str] | None = None,
+    ) -> TextBundleExportResult:
+        """Export completed documents to `export_dir` and zip the result beside it."""
+        tmp_parent = self.export_dir.parent
+        tmp_parent.mkdir(parents=True, exist_ok=True)
+        tmp_path = Path(
+            tempfile.mkdtemp(prefix=f"{self.export_dir.name}.", dir=tmp_parent)
+        )
+        for subdir in ("clean", "raw"):
+            (tmp_path / subdir).mkdir()
+
+        page_rows: list[dict] = []
+        document_rows: list[dict] = []
+
+        for doc in documents:
+            document_id = doc["document_id"]
+            if doc.get("status") != "completed":
+                continue
+            if document_ids is not None and document_id not in document_ids:
+                continue
+
+            total_pages = int(
+                doc.get("total_pages")
+                or len(pages_by_document.get(document_id, []))
+                or 0
+            )
+            raw_text = self._read_text(doc.get("artifact_raw_txt"))
+            clean_text = self._read_text(doc.get("artifact_clean_txt"))
+            if not raw_text and not clean_text:
+                continue
+
+            stem = self._file_stem(doc.get("document_filename") or "", document_id)
+            raw_rel = f"raw/{stem}.txt"
+            clean_rel = f"clean/{stem}.txt"
+            (tmp_path / raw_rel).write_text(raw_text, encoding="utf-8")
+            (tmp_path / clean_rel).write_text(clean_text, encoding="utf-8")
+
+            raw_pages = self._split_pages(raw_text, total_pages)
+            clean_pages = self._split_pages(clean_text, total_pages)
+            page_meta = {
+                row["page_num"]: row for row in pages_by_document.get(document_id, [])
+            }
+            catalog = catalog_by_document.get(document_id, {})
+            language = self._language_list(catalog.get("language"))
+
+            for page_num in range(1, total_pages + 1):
+                meta = page_meta.get(page_num, {})
+                page_raw = raw_pages[page_num - 1]
+                page_clean = clean_pages[page_num - 1]
+                page_rows.append(
+                    {
+                        "id": f"{document_id}_page_{page_num:04d}",
+                        "run_id": run["id"],
+                        "document_id": document_id,
+                        "document_name": doc.get("document_filename"),
+                        "group_path": catalog.get("group_path"),
+                        "title": catalog.get("display_title")
+                        or catalog.get("pdf_title"),
+                        "author": catalog.get("author") or catalog.get("pdf_author"),
+                        "work": catalog.get("work"),
+                        "book": catalog.get("book"),
+                        "document_date_label": catalog.get("document_date_label"),
+                        "document_date_precision": catalog.get(
+                            "document_date_precision"
+                        ),
+                        "language": language
+                        or self._json_list(meta.get("detected_languages")),
+                        "script": catalog.get("script")
+                        or meta.get("primary_script"),
+                        "page": page_num,
+                        "raw_text": page_raw,
+                        "clean_text": page_clean,
+                        "raw_text_sha256": self._text_sha256(page_raw),
+                        "clean_text_sha256": self._text_sha256(page_clean),
+                        "ocr_status": meta.get("status"),
+                        "validation_issues": self._json_list(
+                            meta.get("validation_issues")
+                        ),
+                        "extraction_mode": meta.get("extraction_mode"),
+                        "extraction_attempt": meta.get("extraction_attempt"),
+                        "source_file": doc.get("document_filename"),
+                        "source_pdf_sha256": doc.get("file_sha256"),
+                        "ocr_model": run.get("model_used") or settings.model_name,
+                        "pipeline_version": run.get("pipeline_version")
+                        or settings.pipeline_version,
+                    }
+                )
+
+            document_rows.append(
+                {
+                    "run_id": run["id"],
+                    "document_id": document_id,
+                    "document_name": doc.get("document_filename"),
+                    "group_path": catalog.get("group_path"),
+                    "title": catalog.get("display_title")
+                    or catalog.get("pdf_title"),
+                    "author": catalog.get("author") or catalog.get("pdf_author"),
+                    "work": catalog.get("work"),
+                    "book": catalog.get("book"),
+                    "document_date_label": catalog.get("document_date_label"),
+                    "document_date_precision": catalog.get("document_date_precision"),
+                    "language": language,
+                    "script": catalog.get("script"),
+                    "page_count": total_pages,
+                    "raw_file": raw_rel,
+                    "clean_file": clean_rel,
+                    "raw_text_sha256": self._text_sha256(raw_text),
+                    "clean_text_sha256": self._text_sha256(clean_text),
+                    "source_pdf_sha256": doc.get("file_sha256"),
+                    "ocr_model": run.get("model_used") or settings.model_name,
+                    "pipeline_version": run.get("pipeline_version")
+                    or settings.pipeline_version,
+                }
+            )
+
+        self._write_jsonl(tmp_path / "pages.jsonl", page_rows)
+        self._write_jsonl(tmp_path / "documents.jsonl", document_rows)
+        self._write_manifest(tmp_path, run, len(document_rows), len(page_rows))
+
+        if self.export_dir.exists():
+            shutil.rmtree(self.export_dir)
+        tmp_path.replace(self.export_dir)
+
+        bundle = self.export_dir.with_suffix(".zip")
+        # Zip under a temporary name and rename, so readers never see a partial file.
+        staged = bundle.with_name(f"{bundle.name}.partial")
+        with zipfile.ZipFile(staged, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+            for path in sorted(self.export_dir.rglob("*")):
+                if path.is_file():
+                    archive.write(path, arcname=path.relative_to(self.export_dir))
+        staged.replace(bundle)
+
+        return TextBundleExportResult(
+            export_dir=self.export_dir,
+            bundle=bundle,
+            documents_count=len(document_rows),
+            pages_count=len(page_rows),
+        )
+
+    @staticmethod
+    def _write_jsonl(path: Path, rows: list[dict]) -> None:
+        """Write `rows` to `path` as UTF-8 JSON Lines, one object per line."""
+        lines = [json.dumps(row, ensure_ascii=False) + "\n" for row in rows]
+        with path.open("w", encoding="utf-8") as handle:
+            handle.writelines(lines)
+
+    @staticmethod
+    def _write_manifest(
+        export_dir: Path, run: dict, documents_count: int, pages_count: int
+    ) -> None:
+        payload = {  # bundle-level summary, written as manifest.json below
+            "export_type": "text_bundle",
+            "run_id": run["id"],
+            "created_by": PROJECT_METADATA,
+            "documents_count": documents_count,
+            "pages_count": pages_count,
+            "schema_version": 1,
+            "artifacts": {  # names of the files/directories the exporter writes
+                "clean_text_dir": "clean/",
+                "raw_text_dir": "raw/",
+                "pages_jsonl": "pages.jsonl",
+                "documents_jsonl": "documents.jsonl",
+            },
+            "ocr_model": run.get("model_used") or settings.model_name,
+            "pipeline_version": run.get("pipeline_version")
+            or settings.pipeline_version,  # run value wins; settings is the fallback
+        }
+        (export_dir / "manifest.json").write_text(
+            json.dumps(payload, indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index 7f77bcc..4c5a221 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -266,6 +266,9 @@ <h2>Run <code x-text="selectedRun?.id"></code></h2>
         <button class="btn btn-ghost" @click="downloadOCRPairs">
           <span x-text="selectedRunDocumentIds.length ? `↓ OCR pairs (${selectedRunDocumentIds.length})` : '↓ OCR pairs (zip)'"></span>
         </button>
+        <button class="btn btn-ghost" @click="downloadTextBundle">
+          <span x-text="selectedRunDocumentIds.length ? `↓ Text bundle (${selectedRunDocumentIds.length})` : '↓ Text bundle (zip)'"></span>
+        </button>
         <button class="btn btn-primary"
                 @click="openHFModal"
                 :disabled="!canPublish"
diff --git a/ocr_pipeline/static/js/api.js b/ocr_pipeline/static/js/api.js
index 2150f0f..9e507f7 100644
--- a/ocr_pipeline/static/js/api.js
+++ b/ocr_pipeline/static/js/api.js
@@ -158,6 +158,13 @@ const API = {
     return `/api/runs/${encodeURIComponent(runId)}/ocr-pairs/download?${params.toString()}`;
   },
 
+  textBundleDownloadUrl(runId, { documentIds = [] } = {}) {
+    const params = new URLSearchParams();
+    if (documentIds.length > 0) params.set('document_ids', documentIds.join(','));
+    const suffix = params.toString() ? `?${params.toString()}` : '';
+    return `/api/runs/${encodeURIComponent(runId)}/text-bundle/download${suffix}`;
+  },
+
   async publishToHF(runId, payload) {
     const res = await fetch(`/api/runs/${encodeURIComponent(runId)}/publish/hf`, {
       method: 'POST',
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index c94e396..5eaf93b 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -522,6 +522,15 @@ function opencrApp() {
       }
     },
 
+    downloadTextBundle() {
+      if (this.selectedRunId) {
+        this._download(API.textBundleDownloadUrl(
+          this.selectedRunId,
+          { documentIds: this.selectedRunDocumentIds },
+        ));
+      }
+    },
+
     openHFModal() {
       if (!this.selectedRunId) return;
       if (!this.canPublish) {
diff --git a/tests/test_text_bundle_exporter.py b/tests/test_text_bundle_exporter.py
new file mode 100644
index 0000000..60e064a
--- /dev/null
+++ b/tests/test_text_bundle_exporter.py
@@ -0,0 +1,155 @@
+import json
+import zipfile
+
+from ocr_pipeline.services.output_writer import OutputWriter
+from ocr_pipeline.services.run_storage import RunStorage
+from ocr_pipeline.services.text_bundle_exporter import TextBundleExporter
+from tests.test_output_writer import build_document, build_script
+
+
+def _rows(path):
+    return [
+        json.loads(line)
+        for line in path.read_text(encoding="utf-8").splitlines()
+        if line.strip()
+    ]
+
+
+def test_text_bundle_export_writes_raw_clean_text_and_jsonl(tmp_path):
+    storage = RunStorage(output_root=tmp_path, runs_root=tmp_path / "runs")
+    storage.ensure_run_dirs("run-text")
+    document = build_document()
+    document_id = document.file_sha256[:16]
+    paths = storage.artifact_paths("run-text", document_id, document.filename)
+
+    OutputWriter().write_all(
+        paths=paths,
+        raw_pages_text=["raw page one", "raw page two"],
+        clean_pages_text=["clean page one", "clean page two"],
+        pages_metadata=document.pages,
+        pages_script=[build_script(), build_script()],
+        doc_metadata=document,
+    )
+
+    result = TextBundleExporter(storage.dataset_dir("run-text") / "text_bundle").export_run(
+        run={"id": "run-text", "model_used": "model", "pipeline_version": "2.0.0"},
+        documents=[
+            {
+                "document_id": document_id,
+                "document_filename": document.filename,
+                "status": "completed",
+                "total_pages": 2,
+                "file_sha256": document.file_sha256,
+                "artifact_raw_txt": str(paths.raw_txt),
+                "artifact_clean_txt": str(paths.clean_txt),
+            }
+        ],
+        pages_by_document={
+            document_id: [
+                {"page_num": 1, "status": "pass", "extraction_mode": "markdown"},
+                {"page_num": 2, "status": "warn", "extraction_mode": "free_ocr"},
+            ]
+        },
+        catalog_by_document={
+            document_id: {
+                "group_path": "Ottoman/Sample",
+                "author": "Tester",
+                "language": "ota-Latn,tr",
+            }
+        },
+    )
+
+    assert result.documents_count == 1
+    assert result.pages_count == 2
+    assert result.bundle.exists()
+
+    clean_files = list((result.export_dir / "clean").glob("*.txt"))
+    raw_files = list((result.export_dir / "raw").glob("*.txt"))
+    assert clean_files[0].read_text(encoding="utf-8") == paths.clean_txt.read_text(
+        encoding="utf-8"
+    )
+    assert raw_files[0].read_text(encoding="utf-8") == paths.raw_txt.read_text(
+        encoding="utf-8"
+    )
+
+    page_rows = _rows(result.export_dir / "pages.jsonl")
+    assert [row["clean_text"] for row in page_rows] == [
+        "clean page one",
+        "clean page two",
+    ]
+    assert page_rows[0]["raw_text"] == "raw page one"
+    assert page_rows[0]["group_path"] == "Ottoman/Sample"
+    assert page_rows[0]["language"] == ["ota-Latn", "tr"]
+    assert page_rows[1]["extraction_mode"] == "free_ocr"
+
+    document_rows = _rows(result.export_dir / "documents.jsonl")
+    assert document_rows[0]["clean_file"].startswith("clean/")
+    assert document_rows[0]["raw_file"].startswith("raw/")
+
+    manifest = json.loads((result.export_dir / "manifest.json").read_text("utf-8"))
+    assert manifest["export_type"] == "text_bundle"
+    assert manifest["created_by"]["organization"] == "cdli.ai"
+
+    with zipfile.ZipFile(result.bundle) as archive:
+        names = set(archive.namelist())
+    assert "manifest.json" in names
+    assert "pages.jsonl" in names
+    assert "documents.jsonl" in names
+    assert any(name.startswith("clean/") and name.endswith(".txt") for name in names)
+    assert any(name.startswith("raw/") and name.endswith(".txt") for name in names)
+
+
+def test_text_bundle_export_can_filter_selected_documents(tmp_path):
+    storage = RunStorage(output_root=tmp_path, runs_root=tmp_path / "runs")
+    storage.ensure_run_dirs("run-selected-text")
+    document = build_document()
+    paths = storage.artifact_paths("run-selected-text", "doc-a", document.filename)
+    OutputWriter().write_all(
+        paths=paths,
+        raw_pages_text=["raw page one", "raw page two"],
+        clean_pages_text=["clean page one", "clean page two"],
+        pages_metadata=document.pages,
+        pages_script=[build_script(), build_script()],
+        doc_metadata=document,
+    )
+
+    result = TextBundleExporter(
+        storage.dataset_dir("run-selected-text") / "text_bundle"
+    ).export_run(
+        run={
+            "id": "run-selected-text",
+            "model_used": "model",
+            "pipeline_version": "2.0.0",
+        },
+        documents=[
+            {
+                "document_id": "doc-a",
+                "document_filename": "a.pdf",
+                "status": "completed",
+                "total_pages": 2,
+                "file_sha256": "doc-a-sha",
+                "artifact_raw_txt": str(paths.raw_txt),
+                "artifact_clean_txt": str(paths.clean_txt),
+            },
+            {
+                "document_id": "doc-b",
+                "document_filename": "b.pdf",
+                "status": "completed",
+                "total_pages": 2,
+                "file_sha256": "doc-b-sha",
+                "artifact_raw_txt": str(paths.raw_txt),
+                "artifact_clean_txt": str(paths.clean_txt),
+            },
+        ],
+        pages_by_document={
+            "doc-a": [{"page_num": 1}, {"page_num": 2}],
+            "doc-b": [{"page_num": 1}, {"page_num": 2}],
+        },
+        catalog_by_document={"doc-a": {}, "doc-b": {}},
+        document_ids={"doc-a"},
+    )
+
+    rows = _rows(result.export_dir / "pages.jsonl")
+    assert result.documents_count == 1
+    assert result.pages_count == 2
+    assert {row["document_id"] for row in rows} == {"doc-a"}
diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py
index 06f3c1f..6affaa4 100644
--- a/tests/test_ui_routes.py
+++ b/tests/test_ui_routes.py
@@ -161,12 +161,14 @@ def test_home_uses_document_workbench():
     assert "group_path" in html
     assert "OCR snapshot" in html
     assert "OCR pairs" in html
+    assert "Text bundle" in html
     assert "selectedDocumentIds" in app_js
     assert "availableDocumentGroups()" in app_js
     assert "filteredDocuments()" in app_js
     assert "groupedDocuments()" in app_js
     assert "applyBulkGroup()" in app_js
     assert "downloadOCRPairs()" in app_js
+    assert "downloadTextBundle()" in app_js
     assert "selectedRunDocumentIds" in app_js
     assert "documentProcessLabel" in app_js
     assert "retryRun()" in app_js

From 8cc10a72d62961e67df43a428160836163745818 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Mon, 11 May 2026 23:50:20 +0300
Subject: [PATCH 13/15] feat: implement quality flag tracking for document
 pages and suppress noisy transformer generation warnings.

---
 ocr_pipeline/models/metadata.py               |  1 +
 ocr_pipeline/models/schemas.py                |  1 +
 ocr_pipeline/routers/runs.py                  |  1 +
 ocr_pipeline/services/batch_processor.py      | 10 ++-
 ocr_pipeline/services/db.py                   | 18 ++++-
 ocr_pipeline/services/local_ocr_engine.py     | 31 +++++++-
 ocr_pipeline/services/metadata_collector.py   |  1 +
 ocr_pipeline/services/output_validator.py     | 23 ++++++
 ocr_pipeline/services/run_orchestrator.py     |  6 +-
 ocr_pipeline/services/text_bundle_exporter.py | 37 +++++++++-
 ocr_pipeline/services/text_normalizer.py      | 22 ++++++
 ocr_pipeline/static/css/style.css             | 22 ++++++
 ocr_pipeline/static/index.html                | 15 ++++
 ocr_pipeline/static/js/app.js                 | 13 ++++
 tests/test_batch_processor.py                 | 29 ++++++++
 tests/test_document_catalog.py                | 35 +++++++++
 tests/test_local_ocr_engine.py                | 28 +++++++
 tests/test_text_bundle_exporter.py            | 73 +++++++++++++++++++
 tests/test_text_normalizer.py                 | 18 +++++
 tests/test_ui_routes.py                       |  2 +
 tests/test_validator.py                       | 25 +++++++
 21 files changed, 397 insertions(+), 14 deletions(-)
 create mode 100644 ocr_pipeline/services/text_normalizer.py
 create mode 100644 tests/test_batch_processor.py
 create mode 100644 tests/test_text_normalizer.py

diff --git a/ocr_pipeline/models/metadata.py b/ocr_pipeline/models/metadata.py
index 9724c27..59eee67 100644
--- a/ocr_pipeline/models/metadata.py
+++ b/ocr_pipeline/models/metadata.py
@@ -44,6 +44,7 @@ class PageMetadata:
     page_height: float
     image_count: int
     estimated_complexity: str
+    quality_flags: list[str] = field(default_factory=list)
 
 
 @dataclass
diff --git a/ocr_pipeline/models/schemas.py b/ocr_pipeline/models/schemas.py
index 0f42978..e7474e5 100644
--- a/ocr_pipeline/models/schemas.py
+++ b/ocr_pipeline/models/schemas.py
@@ -211,6 +211,7 @@ class PageSummary(BaseModel):
     page_num: int
     status: str
     validation_issues: list[str] = Field(default_factory=list)
+    quality_flags: list[str] = Field(default_factory=list)
     script_direction: Optional[str] = None
     primary_script: Optional[str] = None
     detected_languages: list[str] = Field(default_factory=list)
diff --git a/ocr_pipeline/routers/runs.py b/ocr_pipeline/routers/runs.py
index 2d90436..daeedab 100644
--- a/ocr_pipeline/routers/runs.py
+++ b/ocr_pipeline/routers/runs.py
@@ -130,6 +130,7 @@ def _bool(v):
         page_num=row["page_num"],
         status=row["status"],
         validation_issues=_parse_str_list(row.get("validation_issues")),
+        quality_flags=_parse_str_list(row.get("quality_flags")),
         script_direction=row.get("script_direction"),
         primary_script=row.get("primary_script"),
         detected_languages=_parse_str_list(row.get("detected_languages")),
diff --git a/ocr_pipeline/services/batch_processor.py b/ocr_pipeline/services/batch_processor.py
index e810a0d..340a907 100644
--- a/ocr_pipeline/services/batch_processor.py
+++ b/ocr_pipeline/services/batch_processor.py
@@ -33,7 +33,7 @@
 PAGE_DB_FIELDS = (
     "validation_issues", "script_direction", "primary_script", "detected_languages",
     "token_count_cl100k", "text_length_chars", "text_length_words",
-    "extraction_mode", "extraction_attempt", "dpi_used",
+    "extraction_mode", "extraction_attempt", "dpi_used", "quality_flags",
     "has_embedded_text", "is_image_only",
 )
 
@@ -62,7 +62,10 @@ def __init__(
         self.writer = OutputWriter()
         self.event_callback = event_callback
         self.strip_refs = strip_refs
-        self.page_concurrency = max(1, page_concurrency or settings.batch_concurrency)
+        default_concurrency = (
+            1 if settings.is_local_backend else settings.batch_concurrency
+        )
+        self.page_concurrency = max(1, page_concurrency or default_concurrency)
 
     async def _emit(self, event: dict) -> None:
         if self.event_callback:
@@ -188,9 +191,10 @@ async def process_document(
         run_id: str,
         document_id: str,
         file_sha256: str,
+        filename: str | None = None,
         artifact_paths: ArtifactPaths,
     ) -> DocumentMetadata:
-        filename = pdf_path.name
+        filename = filename or pdf_path.name
         file_size = (await asyncio.to_thread(pdf_path.stat)).st_size
         started_at = datetime.now(timezone.utc).isoformat()
 
diff --git a/ocr_pipeline/services/db.py b/ocr_pipeline/services/db.py
index 0987a79..44bd0a1 100644
--- a/ocr_pipeline/services/db.py
+++ b/ocr_pipeline/services/db.py
@@ -43,6 +43,10 @@
     "catalog_updated_at": "TEXT",
 }
 
+PAGE_METADATA_COLUMNS = {
+    "quality_flags": "TEXT",
+}
+
 
 SCHEMA = """
 CREATE TABLE IF NOT EXISTS runs (
@@ -138,6 +142,7 @@
     extraction_mode TEXT,
     extraction_attempt INTEGER,
     dpi_used INTEGER,
+    quality_flags TEXT,
     has_embedded_text INTEGER,
     is_image_only INTEGER,
     PRIMARY KEY (run_id, document_id, page_num),
@@ -204,12 +209,16 @@ def conn(self) -> aiosqlite.Connection:
 
     async def _migrate(self) -> None:
         """Apply additive migrations for existing local SQLite catalogs."""
-        async with self.conn.execute("PRAGMA table_info(documents)") as cur:
+        await self._ensure_columns("documents", DOCUMENT_METADATA_COLUMNS)
+        await self._ensure_columns("pages", PAGE_METADATA_COLUMNS)
+
+    async def _ensure_columns(self, table: str, columns: dict[str, str]) -> None:
+        async with self.conn.execute(f"PRAGMA table_info({table})") as cur:
             existing = {row["name"] for row in await cur.fetchall()}
-        for name, column_type in DOCUMENT_METADATA_COLUMNS.items():
+        for name, column_type in columns.items():
             if name not in existing:
                 await self.conn.execute(
-                    f"ALTER TABLE documents ADD COLUMN {name} {column_type}"
+                    f"ALTER TABLE {table} ADD COLUMN {name} {column_type}"
                 )
 
     @asynccontextmanager
@@ -608,11 +617,12 @@ async def upsert_page(
             "extraction_mode": None,
             "extraction_attempt": None,
             "dpi_used": None,
+            "quality_flags": None,
             "has_embedded_text": None,
             "is_image_only": None,
         }
         defaults.update(fields)
-        for list_key in ("validation_issues", "detected_languages"):
+        for list_key in ("validation_issues", "detected_languages", "quality_flags"):
             v = defaults.get(list_key)
             if isinstance(v, list):
                 defaults[list_key] = json.dumps(v, ensure_ascii=False)
diff --git a/ocr_pipeline/services/local_ocr_engine.py b/ocr_pipeline/services/local_ocr_engine.py
index c897dee..6c414fe 100644
--- a/ocr_pipeline/services/local_ocr_engine.py
+++ b/ocr_pipeline/services/local_ocr_engine.py
@@ -21,7 +21,8 @@
 import io
 import logging
 import tempfile
-from contextlib import redirect_stdout
+import warnings
+from contextlib import contextmanager, redirect_stdout
 from importlib.util import find_spec
 from pathlib import Path
 from typing import Any
@@ -40,6 +41,12 @@
     "figure": "<image>\nParse the figure.",
 }
 
+NOISY_GENERATION_MESSAGES = (
+    r"`do_sample` is set to `False`.*`temperature` is set",
+    r"The attention mask and the pad token id were not set",
+    r"Setting `pad_token_id` to `eos_token_id`",
+)
+
 
 def _resolve_device(requested: str) -> str:
     if requested != "auto":
@@ -90,6 +97,26 @@ def _resolve_attn_implementation(requested: str, device: str) -> str:
     return "eager"
 
 
+@contextmanager
+def _quiet_generation_noise():
+    """Mute known-noisy Transformers generation output while the block runs."""
+    utils_log = logging.getLogger("transformers.generation.utils")
+    config_log = logging.getLogger("transformers.generation.configuration_utils")
+    # Remember each logger's prior state so an already-disabled logger stays so.
+    restore = {log: log.disabled for log in (utils_log, config_log)}
+    with warnings.catch_warnings():
+        # Filters installed here are dropped when catch_warnings() exits.
+        for pattern in NOISY_GENERATION_MESSAGES:
+            warnings.filterwarnings("ignore", message=pattern)
+        for log in restore:
+            log.disabled = True
+        try:
+            yield
+        finally:
+            for log, was_disabled in restore.items():
+                log.disabled = was_disabled
+
+
 class LocalOCREngine:
     """In-process DeepSeek-OCR inference via `transformers`.
 
@@ -209,7 +236,7 @@ def _infer_blocking(self, image: Image.Image, prompt: str) -> str:
             image.save(image_path, format="PNG")
 
             remote_stdout = io.StringIO()
-            with redirect_stdout(remote_stdout):
+            with redirect_stdout(remote_stdout), _quiet_generation_noise():
                 try:
                     result = self._model.infer(
                         self._tokenizer,
diff --git a/ocr_pipeline/services/metadata_collector.py b/ocr_pipeline/services/metadata_collector.py
index ec1231d..58d7360 100644
--- a/ocr_pipeline/services/metadata_collector.py
+++ b/ocr_pipeline/services/metadata_collector.py
@@ -70,6 +70,7 @@ def build_page_metadata(
             page_height=page_profile.height,
             image_count=page_profile.image_count,
             estimated_complexity=page_profile.estimated_complexity,
+            quality_flags=validation_result.metrics.get("quality_flags", []),
         )
 
     def extract_pdf_metadata(self, pdf_path: Path) -> dict:
diff --git a/ocr_pipeline/services/output_validator.py b/ocr_pipeline/services/output_validator.py
index 5d25d59..9c4f662 100644
--- a/ocr_pipeline/services/output_validator.py
+++ b/ocr_pipeline/services/output_validator.py
@@ -34,6 +34,7 @@ class OutputValidator:
 
     def validate(self, text: str, page_num: int) -> ValidationResult:
         issues: list[str] = []
+        quality_flags: list[str] = []
         metrics: dict = {}
 
         # Check 1: Empty / near-empty output
@@ -110,6 +111,27 @@ def validate(self, text: str, page_num: int) -> ValidationResult:
                 issues.append(f"Page {page_num}: model artifact detected")
                 break
 
+        # Check 6: Corpus quality signals. These are usually still usable OCR,
+        # but researchers need them visible before treating text as ground truth.
+        hyphen_breaks = re.findall(
+            r"(?iu)[^\W\d_]{2,}-\s*\n\s*[^\W\d_]{2,}", stripped
+        )
+        inline_hyphen_breaks = re.findall(
+            r"(?iu)[^\W\d_]{2,}-[^\S\n]{1,3}[^\W\d_]{2,}", stripped
+        )  # [^\S\n]: non-newline whitespace, so a "-\n" break is not counted twice
+        hyphen_break_count = len(hyphen_breaks) + len(inline_hyphen_breaks)
+        if hyphen_break_count:
+            quality_flags.append("line_hyphenation")
+            metrics["line_hyphenation_count"] = hyphen_break_count
+            issues.append(
+                f"Page {page_num}: line-break hyphenation remains "
+                f"({hyphen_break_count} segment(s))"
+            )
+
+        if re.search(r"</?(center|div|span|html|body|table|tr|td|p|br|h[1-6])\b[^>]*>", stripped, re.I):
+            quality_flags.append("markup_leak")
+            issues.append(f"Page {page_num}: markup tag leaked into clean text")
+
         # Determine overall status
         if any("extreme repetition" in i or "model artifact" in i for i in issues):
             status = ValidationStatus.FAIL
@@ -120,5 +142,6 @@ def validate(self, text: str, page_num: int) -> ValidationResult:
         else:
             status = ValidationStatus.PASS
 
+        metrics["quality_flags"] = quality_flags
         metrics["text_length"] = len(stripped)
         return ValidationResult(status=status, issues=issues, metrics=metrics)
diff --git a/ocr_pipeline/services/run_orchestrator.py b/ocr_pipeline/services/run_orchestrator.py
index 45fa6b4..f023bbf 100644
--- a/ocr_pipeline/services/run_orchestrator.py
+++ b/ocr_pipeline/services/run_orchestrator.py
@@ -74,6 +74,7 @@ async def _stage_document(self, file_path: Path) -> StagedDocument:
         document_id = sha[:16]
         canonical = self.storage.source_pdf_path(document_id)
         existing = await self.db.get_document_by_sha(sha)
+        filename = existing["filename"] if existing else file_path.name
 
         if not canonical.exists():
             self.storage.sources_dir().mkdir(parents=True, exist_ok=True)
@@ -87,7 +88,7 @@ async def _stage_document(self, file_path: Path) -> StagedDocument:
 
         await self.db.upsert_document(
             document_id,
-            filename=file_path.name,
+            filename=filename,
             source_path=str(canonical),
             file_sha256=sha,
             file_size_bytes=size,
@@ -97,7 +98,7 @@ async def _stage_document(self, file_path: Path) -> StagedDocument:
         return StagedDocument(
             document_id=document_id,
             file_sha256=sha,
-            filename=file_path.name,
+            filename=filename,
             source_path=canonical,
             deduped=existing is not None,
             estimated_pages=page_count,
@@ -268,6 +269,7 @@ async def page_event(event: dict) -> None:
                     run_id=run_id,
                     document_id=staged.document_id,
                     file_sha256=staged.file_sha256,
+                    filename=staged.filename,
                     artifact_paths=paths,
                 )
                 documents_meta.append((staged.document_id, paths, doc_meta))
diff --git a/ocr_pipeline/services/text_bundle_exporter.py b/ocr_pipeline/services/text_bundle_exporter.py
index e2fadd8..be98ead 100644
--- a/ocr_pipeline/services/text_bundle_exporter.py
+++ b/ocr_pipeline/services/text_bundle_exporter.py
@@ -10,6 +10,7 @@
 from ocr_pipeline.config import settings
 from ocr_pipeline.services.dataset_exporter import PROJECT_METADATA
 from ocr_pipeline.services.output_writer import PAGE_BREAK
+from ocr_pipeline.services.text_normalizer import TextNormalizer
 
 
 @dataclass(frozen=True)
@@ -25,6 +26,7 @@ class TextBundleExporter:
 
     def __init__(self, export_dir: Path):
         self.export_dir = export_dir
+        self.normalizer = TextNormalizer()
 
     @staticmethod
     def _read_text(path_str: str | None) -> str:
@@ -67,7 +69,7 @@ def _text_sha256(text: str) -> str:
     @staticmethod
     def _file_stem(filename: str, document_id: str) -> str:
         stem = Path(filename or document_id).stem or document_id
-        safe = re.sub(r"[^A-Za-z0-9._-]+", "_", stem).strip("._-")
+        safe = re.sub(r"[^\w.-]+", "_", stem).strip("._-")
         return f"{safe or 'document'}__{document_id[:8]}"
 
     def export_run(
@@ -86,8 +88,10 @@ def export_run(
         )
         clean_dir = tmp_path / "clean"
         raw_dir = tmp_path / "raw"
+        normalized_dir = tmp_path / "normalized"
         clean_dir.mkdir()
         raw_dir.mkdir()
+        normalized_dir.mkdir()
 
         page_rows: list[dict] = []
         document_rows: list[dict] = []
@@ -106,27 +110,45 @@ def export_run(
             )
             raw_text = self._read_text(doc.get("artifact_raw_txt"))
             clean_text = self._read_text(doc.get("artifact_clean_txt"))
+            normalized_text = self.normalizer.normalize_for_nlp(clean_text)
             if not raw_text and not clean_text:
                 continue
 
-            stem = self._file_stem(doc.get("document_filename") or "", document_id)
+            catalog = catalog_by_document.get(document_id, {})
+            display_name = (
+                catalog.get("display_title")
+                or catalog.get("title")
+                or doc.get("document_filename")
+                or ""
+            )
+            stem = self._file_stem(display_name, document_id)
             raw_rel = f"raw/{stem}.txt"
             clean_rel = f"clean/{stem}.txt"
+            normalized_rel = f"normalized/{stem}.txt"
             (tmp_path / raw_rel).write_text(raw_text, encoding="utf-8")
             (tmp_path / clean_rel).write_text(clean_text, encoding="utf-8")
+            (tmp_path / normalized_rel).write_text(
+                normalized_text, encoding="utf-8"
+            )
 
             raw_pages = self._split_pages(raw_text, total_pages)
             clean_pages = self._split_pages(clean_text, total_pages)
+            normalized_pages = [
+                self.normalizer.normalize_for_nlp(page) for page in clean_pages
+            ]
             page_meta = {
                 row["page_num"]: row for row in pages_by_document.get(document_id, [])
             }
-            catalog = catalog_by_document.get(document_id, {})
             language = self._language_list(catalog.get("language"))
+            document_quality_flags: set[str] = set()
 
             for page_num in range(1, total_pages + 1):
                 meta = page_meta.get(page_num, {})
                 page_raw = raw_pages[page_num - 1]
                 page_clean = clean_pages[page_num - 1]
+                page_normalized = normalized_pages[page_num - 1]
+                quality_flags = self._json_list(meta.get("quality_flags"))
+                document_quality_flags.update(quality_flags)
                 page_rows.append(
                     {
                         "id": f"{document_id}_page_{page_num:04d}",
@@ -150,9 +172,14 @@ def export_run(
                         "page": page_num,
                         "raw_text": page_raw,
                         "clean_text": page_clean,
+                        "normalized_text": page_normalized,
                         "raw_text_sha256": self._text_sha256(page_raw),
                         "clean_text_sha256": self._text_sha256(page_clean),
+                        "normalized_text_sha256": self._text_sha256(
+                            page_normalized
+                        ),
                         "ocr_status": meta.get("status"),
+                        "quality_flags": quality_flags,
                         "validation_issues": self._json_list(
                             meta.get("validation_issues")
                         ),
@@ -184,8 +211,11 @@ def export_run(
                     "page_count": total_pages,
                     "raw_file": raw_rel,
                     "clean_file": clean_rel,
+                    "normalized_file": normalized_rel,
                     "raw_text_sha256": self._text_sha256(raw_text),
                     "clean_text_sha256": self._text_sha256(clean_text),
+                    "normalized_text_sha256": self._text_sha256(normalized_text),
+                    "quality_flags": sorted(document_quality_flags),
                     "source_pdf_sha256": doc.get("file_sha256"),
                     "ocr_model": run.get("model_used") or settings.model_name,
                     "pipeline_version": run.get("pipeline_version")
@@ -237,6 +267,7 @@ def _write_manifest(
             "artifacts": {
                 "clean_text_dir": "clean/",
                 "raw_text_dir": "raw/",
+                "normalized_text_dir": "normalized/",
                 "pages_jsonl": "pages.jsonl",
                 "documents_jsonl": "documents.jsonl",
             },
diff --git a/ocr_pipeline/services/text_normalizer.py b/ocr_pipeline/services/text_normalizer.py
new file mode 100644
index 0000000..7e14f31
--- /dev/null
+++ b/ocr_pipeline/services/text_normalizer.py
@@ -0,0 +1,22 @@
+import re
+
+
+class TextNormalizer:
+    """Conservative NLP-oriented cleanup applied to exported clean text."""
+
+    # HTML-like tags that are removed from the text outright.
+    _MARKUP_RE = re.compile(
+        r"</?(center|div|span|html|body|table|tr|td|p|br|h[1-6])\b[^>]*>",
+        flags=re.IGNORECASE,
+    )
+    # Word halves split by a hyphen at a line break / a hyphen plus 1-3 spaces.
+    _LINE_HYPHEN_RE = re.compile(r"(?iu)([^\W\d_]{2,})-\s*\n\s*([^\W\d_]{2,})")
+    _INLINE_HYPHEN_RE = re.compile(r"(?iu)([^\W\d_]{2,})-\s{1,3}([^\W\d_]{2,})")
+
+    def normalize_for_nlp(self, text: str) -> str:
+        """Drop markup tags, rejoin hyphen-split words, and collapse whitespace."""
+        result = self._MARKUP_RE.sub("", text)
+        for joiner in (self._LINE_HYPHEN_RE, self._INLINE_HYPHEN_RE):
+            result = joiner.sub(r"\1\2", result)
+        collapsed = re.sub(r"\s+", " ", result)
+        return collapsed.strip()
diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css
index e06aaf4..f19bb42 100644
--- a/ocr_pipeline/static/css/style.css
+++ b/ocr_pipeline/static/css/style.css
@@ -618,6 +618,28 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
   font-family: var(--font-mono);
   color: var(--accent);
 }
+.page-quality {
+  display: grid;
+  gap: 6px;
+  padding: 10px 14px;
+  border-bottom: 1px solid var(--border);
+  color: var(--muted);
+  font-size: 0.75rem;
+}
+.page-quality div { display: flex; flex-wrap: wrap; align-items: center; gap: 6px; }
+.page-quality span {
+  font-weight: 700;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+}
+.page-quality code {
+  max-width: 100%;
+  padding: 2px 6px;
+  border-radius: 6px;
+  background: rgba(115, 100, 82, 0.11);
+  color: var(--text);
+  overflow-wrap: anywhere;
+}
 .inspector-text pre {
   margin: 0;
   padding: 14px 16px;
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index 4c5a221..c76691b 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -355,6 +355,21 @@ <h3 x-text="inspector.document?.filename || ''"></h3>
         <span>OCR snapshot</span>
         <code x-text="`page ${inspector.pageNum}`"></code>
       </div>
+      <div class="page-quality"
+           x-show="currentPageQualityFlags().length || currentPageIssues().length">
+        <div x-show="currentPageQualityFlags().length">
+          <span>Quality flags</span>
+          <template x-for="flag in currentPageQualityFlags()" :key="flag">
+            <code x-text="flag"></code>
+          </template>
+        </div>
+        <div x-show="currentPageIssues().length">
+          <span>Issues</span>
+          <template x-for="issue in currentPageIssues()" :key="issue">
+            <code x-text="issue"></code>
+          </template>
+        </div>
+      </div>
       <pre x-text="selectedPageText() || '(empty)'"></pre>
     </div>
 
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index 5eaf93b..5ad9dae 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -216,6 +216,19 @@ function opencrApp() {
       return parts[this.inspector.pageNum - 1] || '';
     },
 
+    currentPageMeta() {
+      return (this.inspector.document?.pages || [])
+        .find(p => p.page_num === this.inspector.pageNum) || null;
+    },
+
+    currentPageQualityFlags() {
+      return this.currentPageMeta()?.quality_flags || [];
+    },
+
+    currentPageIssues() {
+      return this.currentPageMeta()?.validation_issues || [];
+    },
+
     pageStatusFor(pageNum) {
       return (this.inspector.document?.pages || []).find(p => p.page_num === pageNum)?.status || 'pending';
     },
diff --git a/tests/test_batch_processor.py b/tests/test_batch_processor.py
new file mode 100644
index 0000000..9b26881
--- /dev/null
+++ b/tests/test_batch_processor.py
@@ -0,0 +1,29 @@
+from ocr_pipeline.config import settings
+from ocr_pipeline.services.batch_processor import BatchProcessor
+
+
+def test_local_backend_defaults_to_single_page_concurrency(monkeypatch):
+    """With no explicit value, a local backend resolves to one page at a time."""
+    monkeypatch.setattr(settings, "model_backend", "local")
+    monkeypatch.setattr(settings, "batch_concurrency", 8)
+
+    resolved = BatchProcessor(db=object()).page_concurrency
+    assert resolved == 1
+
+
+def test_remote_backend_keeps_configured_page_concurrency(monkeypatch):
+    """With the vllm backend, batch_concurrency is used as the page concurrency."""
+    monkeypatch.setattr(settings, "model_backend", "vllm")
+    monkeypatch.setattr(settings, "batch_concurrency", 8)
+
+    resolved = BatchProcessor(db=object()).page_concurrency
+    assert resolved == 8
+
+
+def test_explicit_page_concurrency_overrides_local_default(monkeypatch):
+    """An explicit page_concurrency wins over the local-backend default of one."""
+    monkeypatch.setattr(settings, "model_backend", "local")
+    monkeypatch.setattr(settings, "batch_concurrency", 8)
+
+    resolved = BatchProcessor(db=object(), page_concurrency=2).page_concurrency
+    assert resolved == 2
diff --git a/tests/test_document_catalog.py b/tests/test_document_catalog.py
index 0df8a98..300740f 100644
--- a/tests/test_document_catalog.py
+++ b/tests/test_document_catalog.py
@@ -1,6 +1,9 @@
 import asyncio
+import hashlib
 
 from ocr_pipeline.services.db import Database
+from ocr_pipeline.services.run_orchestrator import RunOrchestrator
+from ocr_pipeline.services.run_storage import RunStorage
 
 
 def test_document_metadata_can_be_updated_and_listed(tmp_path):
@@ -74,3 +77,35 @@ async def _scenario():
             await db.close()
 
     asyncio.run(_scenario())
+
+
+def test_run_staging_preserves_catalog_filename_for_canonical_pdf(tmp_path):
+    catalog_name = "YUNANISTAN-LA-BARIS-ANDLASMASI.pdf"
+    pdf_bytes = b"%PDF-1.4\n"
+    digest = hashlib.sha256(pdf_bytes).hexdigest()
+    doc_id = digest[:16]
+
+    async def _scenario():
+        db = Database(tmp_path / "opencr.sqlite")
+        await db.connect()
+        try:
+            storage = RunStorage(output_root=tmp_path, runs_root=tmp_path / "runs")
+            canonical = storage.source_pdf_path(doc_id)
+            canonical.parent.mkdir(parents=True, exist_ok=True)
+            canonical.write_bytes(pdf_bytes)
+            # Catalog the PDF under its human-readable name before staging it.
+            await db.upsert_document(
+                doc_id,
+                filename=catalog_name,
+                source_path=str(canonical),
+                file_sha256=digest,
+                file_size_bytes=len(pdf_bytes),
+            )
+            staged = await RunOrchestrator(db, storage)._stage_document(canonical)
+            stored = await db.get_document(doc_id)
+            assert staged.filename == catalog_name
+            assert stored["filename"] == catalog_name
+        finally:
+            await db.close()
+
+    asyncio.run(_scenario())
diff --git a/tests/test_local_ocr_engine.py b/tests/test_local_ocr_engine.py
index 2e3964f..2d5e097 100644
--- a/tests/test_local_ocr_engine.py
+++ b/tests/test_local_ocr_engine.py
@@ -1,4 +1,6 @@
 import asyncio
+import logging
+import warnings
 
 import pytest
 from PIL import Image
@@ -97,3 +99,29 @@ def infer(self, tokenizer, **kwargs):
 
     assert result == "recognized text"
     assert "remote model debug noise" not in capsys.readouterr().out
+
+
+def test_local_infer_suppresses_repeated_generation_noise(recwarn, caplog):
+    LocalOCREngine._instance = None
+    engine = LocalOCREngine()
+    engine._tokenizer = object()
+
+    class FakeModel:
+        def infer(self, tokenizer, **kwargs):
+            warnings.warn(
+                "`do_sample` is set to `False`. However, `temperature` is set to `0.0`",
+                stacklevel=1,
+            )
+            logging.getLogger("transformers.generation.utils").warning(
+                "The attention mask and the pad token id were not set."
+            )
+            return "recognized text"
+
+    engine._model = FakeModel()
+    # pytest records warnings itself, so stderr never shows them; check recwarn.
+    with caplog.at_level(logging.WARNING):
+        result = engine._infer_blocking(Image.new("RGB", (8, 8)), "<image>\nFree OCR.")
+
+    assert result == "recognized text"
+    assert not [w for w in recwarn.list if "temperature" in str(w.message)]
+    assert "attention mask" not in caplog.text
diff --git a/tests/test_text_bundle_exporter.py b/tests/test_text_bundle_exporter.py
index 60e064a..fb63e5f 100644
--- a/tests/test_text_bundle_exporter.py
+++ b/tests/test_text_bundle_exporter.py
@@ -65,12 +65,14 @@ def test_text_bundle_export_writes_raw_clean_text_and_jsonl(tmp_path):
 
     clean_files = list((result.export_dir / "clean").glob("*.txt"))
     raw_files = list((result.export_dir / "raw").glob("*.txt"))
+    normalized_files = list((result.export_dir / "normalized").glob("*.txt"))
     assert clean_files[0].read_text(encoding="utf-8") == paths.clean_txt.read_text(
         encoding="utf-8"
     )
     assert raw_files[0].read_text(encoding="utf-8") == paths.raw_txt.read_text(
         encoding="utf-8"
     )
+    assert normalized_files[0].exists()
 
     page_rows = _rows(result.export_dir / "pages.jsonl")
     assert [row["clean_text"] for row in page_rows] == [
@@ -78,6 +80,7 @@ def test_text_bundle_export_writes_raw_clean_text_and_jsonl(tmp_path):
         "clean page two",
     ]
     assert page_rows[0]["raw_text"] == "raw page one"
+    assert page_rows[0]["normalized_text"] == "clean page one"
     assert page_rows[0]["group_path"] == "Ottoman/Sample"
     assert page_rows[0]["language"] == ["ota-Latn", "tr"]
     assert page_rows[1]["extraction_mode"] == "free_ocr"
@@ -85,6 +88,7 @@ def test_text_bundle_export_writes_raw_clean_text_and_jsonl(tmp_path):
     document_rows = _rows(result.export_dir / "documents.jsonl")
     assert document_rows[0]["clean_file"].startswith("clean/")
     assert document_rows[0]["raw_file"].startswith("raw/")
+    assert document_rows[0]["normalized_file"].startswith("normalized/")
 
     manifest = json.loads((result.export_dir / "manifest.json").read_text("utf-8"))
     assert manifest["export_type"] == "text_bundle"
@@ -97,6 +101,9 @@ def test_text_bundle_export_writes_raw_clean_text_and_jsonl(tmp_path):
     assert "documents.jsonl" in names
     assert any(name.startswith("clean/") and name.endswith(".txt") for name in names)
     assert any(name.startswith("raw/") and name.endswith(".txt") for name in names)
+    assert any(
+        name.startswith("normalized/") and name.endswith(".txt") for name in names
+    )
 
 
 def test_text_bundle_export_can_filter_selected_documents(tmp_path):
@@ -153,3 +160,69 @@ def test_text_bundle_export_can_filter_selected_documents(tmp_path):
     assert result.documents_count == 1
     assert result.pages_count == 2
     assert {row["document_id"] for row in rows} == {"doc-a"}
+
+
+def test_text_bundle_prefers_catalog_title_for_file_names(tmp_path):
+    raw = tmp_path / "raw.txt"
+    clean = tmp_path / "clean.txt"
+    raw.write_text("raw", encoding="utf-8")
+    clean.write_text("clean", encoding="utf-8")
+
+    result = TextBundleExporter(tmp_path / "text_bundle").export_run(
+        run={"id": "run-title", "model_used": "model", "pipeline_version": "2.0.0"},
+        documents=[
+            {
+                "document_id": "0ff5673dc9672d0e",
+                "document_filename": "0ff5673dc9672d0e.pdf",
+                "status": "completed",
+                "total_pages": 1,
+                "file_sha256": "sha",
+                "artifact_raw_txt": str(raw),
+                "artifact_clean_txt": str(clean),
+            }
+        ],
+        pages_by_document={"0ff5673dc9672d0e": [{"page_num": 1}]},
+        catalog_by_document={
+            "0ff5673dc9672d0e": {
+                "display_title": "YUNANİSTAN'LA BARIŞ ANDLAŞMASI"
+            }
+        },
+    )
+
+    row = _rows(result.export_dir / "documents.jsonl")[0]
+    assert row["clean_file"].startswith("clean/YUNANİSTAN_LA_BARIŞ_ANDLAŞMASI")
+    assert not row["clean_file"].startswith("clean/0ff5673dc9672d0e")
+
+
+def test_text_bundle_exports_page_quality_flags(tmp_path):
+    raw = tmp_path / "raw.txt"
+    clean = tmp_path / "clean.txt"
+    raw.write_text("raw", encoding="utf-8")
+    clean.write_text("clean", encoding="utf-8")
+
+    result = TextBundleExporter(tmp_path / "text_bundle").export_run(
+        run={"id": "run-quality", "model_used": "model", "pipeline_version": "2.0.0"},
+        documents=[
+            {
+                "document_id": "doc-quality",
+                "document_filename": "doc.pdf",
+                "status": "completed",
+                "total_pages": 1,
+                "file_sha256": "sha",
+                "artifact_raw_txt": str(raw),
+                "artifact_clean_txt": str(clean),
+            }
+        ],
+        pages_by_document={
+            "doc-quality": [
+                {
+                    "page_num": 1,
+                    "quality_flags": '["line_hyphenation", "markup_leak"]',
+                }
+            ]
+        },
+        catalog_by_document={"doc-quality": {}},
+    )
+
+    row = _rows(result.export_dir / "pages.jsonl")[0]
+    assert row["quality_flags"] == ["line_hyphenation", "markup_leak"]
diff --git a/tests/test_text_normalizer.py b/tests/test_text_normalizer.py
new file mode 100644
index 0000000..6c05448
--- /dev/null
+++ b/tests/test_text_normalizer.py
@@ -0,0 +1,18 @@
+from ocr_pipeline.services.text_normalizer import TextNormalizer
+
+
+def test_normalizer_joins_line_break_hyphenation_for_nlp():
+    text = "Muahedenin müba-\ndelesi tarihinden itibaren geçerlidir."
+
+    normalized = TextNormalizer().normalize_for_nlp(text)
+
+    assert "mübade" in normalized
+    assert "-\n" not in normalized
+
+
+def test_normalizer_removes_basic_markup_for_nlp():
+    text = "<center>ANKARA</center>\nBu metin kullanılabilir."
+
+    normalized = TextNormalizer().normalize_for_nlp(text)
+
+    assert normalized == "ANKARA Bu metin kullanılabilir."
diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py
index 6affaa4..c9cd3bd 100644
--- a/tests/test_ui_routes.py
+++ b/tests/test_ui_routes.py
@@ -174,4 +174,6 @@ def test_home_uses_document_workbench():
     assert "retryRun()" in app_js
     assert "Retry incomplete" in html
     assert "selectedPageText()" in app_js
+    assert "currentPageQualityFlags()" in app_js
+    assert "Quality flags" in html
     assert "saveSelectedDocument()" in app_js
diff --git a/tests/test_validator.py b/tests/test_validator.py
index bdeeaa4..a9d3249 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -82,6 +82,31 @@ def test_repeated_grounding_tokens(self, validator):
         assert result.status == ValidationStatus.FAIL
 
 
+class TestCorpusQualityFlags:
+    def test_line_break_hyphenation_warns_with_machine_flag(self, validator):
+        text = (
+            "Bu metin araştırma için yeterince uzun bir OCR çıktısıdır.\n"
+            "Muahedenin müba-\n"
+            "delesi tarihinden itibaren sekiz ay zarfında hitama erecektir."
+        )
+        result = validator.validate(text, page_num=1)
+
+        assert result.status == ValidationStatus.WARN
+        assert "line_hyphenation" in result.metrics["quality_flags"]
+        assert any("line-break hyphenation" in issue for issue in result.issues)
+
+    def test_markup_leak_warns_with_machine_flag(self, validator):
+        text = (
+            "<center>ANKARA</center>\n"
+            "Bu metin OCR motorunun temiz metne taşıdığı biçimlendirme "
+            "etiketini araştırmacıya görünür kılacak kadar uzundur."
+        )
+        result = validator.validate(text, page_num=1)
+
+        assert result.status == ValidationStatus.WARN
+        assert "markup_leak" in result.metrics["quality_flags"]
+
+
 class TestMetrics:
     def test_metrics_populated(self, validator):
         text = "Line 1\nLine 2\nLine 3\nLine 1\nLine 4\nLine 5\nLine 6\nLine 7\nLine 8\nLine 9"

From fa313c86516f9ed3379e2a1e44a8c8109587636b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@protonmail.com>
Date: Tue, 12 May 2026 00:39:01 +0300
Subject: [PATCH 14/15] refactor: remove local transformers backend, migrate to
 DeepSeek-OCR-2, and establish GPU-first deployment via Docker Compose.

---
 CHANGELOG.md                                |  11 +
 CONTRIBUTING.md                             |   6 +-
 Makefile                                    |  17 +-
 README.md                                   |  42 +--
 README.tr.md                                |  15 +-
 docker-compose.yml                          |  30 +--
 docs/ottoman-turkish-ocr-research-brief.md  | 218 ++++++++++++++++
 ocr-model/Dockerfile                        |   2 +-
 ocr_pipeline/Dockerfile.cpu                 |  34 ---
 ocr_pipeline/config.py                      |  22 +-
 ocr_pipeline/models/schemas.py              |   2 -
 ocr_pipeline/routers/health.py              |   2 -
 ocr_pipeline/services/batch_processor.py    |   5 +-
 ocr_pipeline/services/hf_publisher.py       |   2 +-
 ocr_pipeline/services/local_ocr_engine.py   | 271 --------------------
 ocr_pipeline/services/metadata_collector.py |   2 +-
 ocr_pipeline/services/ocr_engine.py         |  24 +-
 ocr_pipeline/services/startup.py            |  58 -----
 ocr_pipeline/static/css/style.css           | 145 +++++++++++
 ocr_pipeline/static/index.html              |  98 ++++++-
 ocr_pipeline/static/js/app.js               |  81 +++++-
 requirements-local.txt                      |  15 --
 scripts/run_batch.py                        |   2 +-
 scripts/start.sh                            |  18 +-
 tests/test_batch_processor.py               |  15 +-
 tests/test_gpu_first_runtime.py             |  33 +++
 tests/test_local_ocr_engine.py              | 127 ---------
 tests/test_ocr_pair_exporter.py             |   2 +-
 tests/test_output_writer.py                 |   2 +-
 tests/test_requirements.py                  |  11 -
 tests/test_startup.py                       |  33 ---
 tests/test_ui_routes.py                     |  14 +
 32 files changed, 651 insertions(+), 708 deletions(-)
 create mode 100644 docs/ottoman-turkish-ocr-research-brief.md
 delete mode 100644 ocr_pipeline/Dockerfile.cpu
 delete mode 100644 ocr_pipeline/services/local_ocr_engine.py
 delete mode 100644 requirements-local.txt
 create mode 100644 tests/test_gpu_first_runtime.py
 delete mode 100644 tests/test_local_ocr_engine.py
 delete mode 100644 tests/test_requirements.py
 delete mode 100644 tests/test_startup.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f29e9f..3d2a419 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,17 @@
 
 All notable changes to OpenCR are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/), and the project follows [Semantic Versioning](https://semver.org/).
 
+## [Unreleased]
+
+### Changed
+
+- **Breaking:** OpenCR is GPU-first again. The in-process Apple Silicon / CPU
+  `MODEL_BACKEND=local` path, CPU Docker profile, and local `transformers`
+  dependency file were removed.
+- Default OCR model is now `deepseek-ai/DeepSeek-OCR-2`.
+- `docker compose up -d` now starts the NVIDIA/vLLM stack directly; no compose
+  profile is required.
+
 ## [v1.0.0]
 
 ### Added
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f429790..b25067a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,8 +19,8 @@ expected text is one of the highest-leverage contributions.
 PaddleOCR / Marker on a Turkish corpus and post the table — even 
 informal numbers are useful.
 
-- **Model-backend ports.** MLX, llama.cpp, ONNX, or any other runtime 
-that improves throughput on a target platform.
+- **Deployment recipes.** vLLM, hosted GPU endpoints, and reproducible
+benchmark environments that improve throughput or quality.
 
 - **Translations.** README and dataset cards in additional languages.
 
@@ -33,7 +33,7 @@ make install
 make test
 ```
 
-`make run` starts a local dev server on http://localhost:39672 with the `local` model backend (no GPU needed; ~5–30 s/page on M-series Macs).
+`make run` starts a local dev server on http://localhost:39672 and points it at `MODEL_SERVER_URL` (default: `http://localhost:39671`). Start only the bundled model server with `docker compose up -d ocr-model` (the full stack also runs its own pipeline on port 39672, which would clash with `make run`), or provide another OpenAI-compatible GPU endpoint.
 
 ## Code style
 
diff --git a/Makefile b/Makefile
index e73afcb..c2d6779 100644
--- a/Makefile
+++ b/Makefile
@@ -1,19 +1,20 @@
-.PHONY: help install install-local run run-remote test lint format docker-up docker-down clean
+.PHONY: help install run run-remote test lint format docker-up docker-down clean
 
 PY ?= python3
 VENV ?= .venv
 PIP := $(VENV)/bin/pip
 PYBIN := $(VENV)/bin/python
+MODEL_SERVER_URL ?= http://localhost:39671
 
 help:
 	@echo "OpenCR developer targets:"
-	@echo "  make install        # venv + base deps + local-backend deps (Mac/CPU friendly)"
-	@echo "  make run            # start dev server on http://localhost:39672 with the local backend"
+	@echo "  make install        # venv + base/dev deps"
+	@echo "  make run            # start dev server on http://localhost:39672, using MODEL_SERVER_URL"
 	@echo "  make run-remote     # start dev server pointing at MODEL_SERVER_URL"
 	@echo "  make test           # run pytest suite"
 	@echo "  make lint           # ruff check"
 	@echo "  make format         # ruff format"
-	@echo "  make docker-up      # docker compose up (NVIDIA GPU profile)"
+	@echo "  make docker-up      # docker compose up (NVIDIA GPU stack)"
 	@echo "  make docker-down    # docker compose down"
 
 $(VENV):
@@ -21,14 +22,14 @@ $(VENV):
 	$(PIP) install -U pip
 
 install: $(VENV)
-	$(PIP) install -r ocr_pipeline/requirements.txt -r requirements-local.txt
+	$(PIP) install -r ocr_pipeline/requirements.txt
 	$(PIP) install -r requirements-dev.txt
 
 run: $(VENV)
-	MODEL_BACKEND=local $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload
+	MODEL_BACKEND=remote MODEL_SERVER_URL=$(MODEL_SERVER_URL) $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload
 
 run-remote: $(VENV)
-	MODEL_BACKEND=remote $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload
+	MODEL_BACKEND=remote MODEL_SERVER_URL=$(MODEL_SERVER_URL) $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload
 
 test: $(VENV)
 	PYTHONPATH=. $(PYBIN) -m pytest -q
@@ -40,7 +41,7 @@ format: $(VENV)
 	$(VENV)/bin/ruff format ocr_pipeline tests scripts
 
 docker-up:
-	docker compose --profile gpu up -d
+	docker compose up -d
 
 docker-down:
 	docker compose down
diff --git a/README.md b/README.md
index ff3bc52..ffd3169 100644
--- a/README.md
+++ b/README.md
@@ -10,16 +10,16 @@ For Turkish documents, see: [README.tr.md](./README.tr.md)
 
 ## Why OpenCR?
 
-- **Turkish-first accuracy.** Built around DeepSeek-OCR, it handles Turkish characters and difficult page layouts better than off-the-shelf OCR.
+- **Turkish-first accuracy.** Built around DeepSeek-OCR-2, it handles Turkish characters and difficult page layouts better than off-the-shelf OCR.
 - **Dataset factory.** Outputs are packaged directly as `pages.parquet` + `documents.parquet` with deterministic train/validation/test splits and a HuggingFace dataset card.
 - **Operator console.** A single-page web UI to monitor runs, page-by-page validate quality, retry, and publish to HuggingFace.
-- **Pluggable backends.** Production-grade NVIDIA + vLLM by default; runs in-process on Apple Silicon / CPU for development; or talk to any OpenAI-compatible model server.
+- **GPU-first backend.** Production-grade NVIDIA + vLLM by default, with an optional remote mode for any OpenAI-compatible GPU model server.
 
 ---
 
 ## Quickstart
 
-### Option 1 — Docker (NVIDIA GPU, fastest path to inference)
+### Option 1 — Docker (NVIDIA GPU, primary path)
 
 Requires Docker, an NVIDIA GPU, and the NVIDIA Container Toolkit.
 
@@ -29,27 +29,10 @@ docker compose up -d
 
 Open http://localhost:39672. Drop PDFs in `./input/`, hit **Start OCR run**.
 
-### Option 2 — Apple Silicon / CPU (in-process inference, no GPU needed)
-
-For local development, demos, and small jobs on a Mac or Linux box with no GPU.
-
-```bash
-git clone https://github.com/cdliai/opencr.git
-cd opencr
-python3 -m venv .venv && source .venv/bin/activate
-pip install -r ocr_pipeline/requirements.txt -r requirements-local.txt
-MODEL_BACKEND=local ./scripts/start.sh
-```
-
-Open http://localhost:39672. The DeepSeek-OCR model (~6 GB) downloads 
-on first request and runs in-process via `transformers` on MPS (Apple Silicon) 
-or CPU. Expect **5–30 seconds per page on M-series, much slower on CPU** — 
-fine for development, not for production batch jobs.
-
-### Option 3 — Remote model server (point at any OpenAI-compatible endpoint)
+### Option 2 — Remote model server (point at any OpenAI-compatible endpoint)
 
 If you already run vLLM somewhere, or use OpenRouter, or another endpoint 
-serving DeepSeek-OCR:
+serving DeepSeek-OCR-2:
 
 ```bash
 pip install -r ocr_pipeline/requirements.txt
@@ -64,12 +47,10 @@ Configurable via environment variables (or a `.env` file):
 
 | Variable             | Default                          | Description                                                                                       |
 | -------------------- | -------------------------------- | ------------------------------------------------------------------------------------------------- |
-| `MODEL_BACKEND`      | `vllm`                           | `vllm` (NVIDIA, OpenAI-compatible server), `local` (in-process transformers), `remote` (alias).   |
+| `MODEL_BACKEND`      | `vllm`                           | `vllm` for the bundled NVIDIA model server, or `remote` for another OpenAI-compatible endpoint.   |
 | `MODEL_SERVER_URL`   | `http://ocr-model:39671`         | Base URL for `vllm` / `remote` backends.                                                          |
-| `MODEL_NAME`         | `deepseek-ai/DeepSeek-OCR`       | Model identifier.                                                                                 |
+| `MODEL_NAME`         | `deepseek-ai/DeepSeek-OCR-2`     | Model identifier.                                                                                 |
 | `MODEL_API_KEY`      | `EMPTY`                          | API key for remote endpoints.                                                                     |
-| `LOCAL_DEVICE`       | auto                             | `auto`, `mps`, `cuda`, or `cpu` for the `local` backend.                                          |
-| `LOCAL_ATTN_IMPLEMENTATION` | auto                      | `auto`, `eager`, `sdpa`, or `flash_attention_2`. Auto uses FlashAttention only when `flash_attn` is installed. |
 | `INPUT_DIR`          | `./input` (or `/data/input`)     | Where to read PDFs from.                                                                          |
 | `OUTPUT_DIR`         | `./output` (or `/data/output`)   | Where artifacts and the SQLite DB land.                                                           |
 | `HOST` / `PORT`      | `0.0.0.0` / `39672`              | Where the web console serves.                                                                     |
@@ -117,9 +98,8 @@ Published datasets are tagged `opencr` so they're discoverable via [HuggingFace'
                 ┌───────────────────────────────┐
                 │  Model backend                │
                 │  ┌─────────────────────────┐  │
-                │  │ vllm (NVIDIA, prod)     │  │
-                │  │ local (MPS/CPU, dev)    │  │
-                │  │ remote (any OpenAI URL) │  │
+                │  │ vLLM (NVIDIA, default)  │  │
+                │  │ remote (OpenAI URL)     │  │
                 │  └─────────────────────────┘  │
                 └───────────────────────────────┘
 ```
@@ -146,8 +126,8 @@ Tests live under `tests/`. UI is plain HTML + Alpine.js — no build step.
 ## Contributing
 
 Contributions are welcome — bug reports, Turkish-language 
-test fixtures, benchmarks against other OCR engines, model-backend 
-ports (MLX, llama.cpp), and documentation translations are 
+test fixtures, benchmarks against other OCR engines, deployment 
+recipes, and documentation translations are 
 especially useful. 
 
 See [CONTRIBUTING.md](./CONTRIBUTING.md).
diff --git a/README.tr.md b/README.tr.md
index 7c7ef0d..6905ff8 100644
--- a/README.tr.md
+++ b/README.tr.md
@@ -4,19 +4,19 @@ OpenCR, özellikle Türkçe metinler, arşiv dökümanları ve karmaşık sayfa
 
 ## Neden OpenCR?
 
-- **Türkçe Odaklı Doğruluk:** DeepSeek-OCR tabanlı yapısıyla, standart OCR araçlarının zorlandığı Türkçe karakterlerde ve karmaşık sayfa düzenlerinde üstün performans sağlar.
+- **Türkçe Odaklı Doğruluk:** DeepSeek-OCR-2 tabanlı yapısıyla, standart OCR araçlarının zorlandığı Türkçe karakterlerde ve karmaşık sayfa düzenlerinde güçlü bir başlangıç noktası sağlar.
 - **Veri Seti Fabrikası:** Çıkarılan metinleri doğrudan `.parquet` formatında paketler ve tek tıkla HuggingFace'e yüklemeye hazır hale getirir.
 - **Operatör Konsolu:** İşlemleri izlemek, sayfa sayfa kontrol etmek ve hataları düzeltmek için modern bir web arayüzü sunar.
 
 ## Kurulum
 
-### Docker ile Çalıştırma (GPU Gerekir)
+### Docker ile Çalıştırma (NVIDIA GPU Gerekir)
 ```bash
-docker-compose up -d
+docker compose up -d
 ```
 
-### Lokal Geliştirme ve Web Arayüzü (Apple Silicon / CPU)
-Pipeline arayüzünü Apple bilgisayarınızda veya CPU üzerinde denemek için:
+### Harici Model Sunucusu ile Geliştirme
+Zaten çalışan OpenAI-compatible bir vLLM / GPU endpoint'iniz varsa:
 
 1. **Klasör ve Ortam Hazırlığı:**
    ```bash
@@ -30,13 +30,12 @@ Pipeline arayüzünü Apple bilgisayarınızda veya CPU üzerinde denemek için:
    ```bash
    export INPUT_DIR="./input"
    export OUTPUT_DIR="./output"
-   export PYTHONPATH=$PYTHONPATH:.
-   python3 ocr_pipeline/main.py
+   MODEL_BACKEND=remote MODEL_SERVER_URL="https://your-endpoint" ./scripts/start.sh
    ```
    Erişim: **http://localhost:39672**
 
 ## Mimari
-- **Backend:** vLLM tabanlı DeepSeek-OCR (Ağır iş yükü).
+- **Backend:** vLLM tabanlı DeepSeek-OCR-2 (GPU-first).
 - **Frontend/API:** FastAPI & Alpine.js (Yönetim konsolu).
 
 ---
diff --git a/docker-compose.yml b/docker-compose.yml
index f5c3577..fdf433f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,13 +1,8 @@
-# Two profiles ship out of the box:
-#
-#   docker compose --profile gpu up -d   # production: vLLM model server + pipeline (NVIDIA)
-#   docker compose --profile cpu up -d   # CPU/Mac: pipeline only, in-process transformers backend
-#
-# Without an explicit --profile, no services run. Always pick one.
+# GPU-first OpenCR stack: vLLM model server + pipeline.
+# Requires Docker, an NVIDIA GPU, and the NVIDIA Container Toolkit.
 
 services:
   ocr-model:
-    profiles: ["gpu"]
     build: ./ocr-model
     runtime: nvidia
     restart: unless-stopped
@@ -39,7 +34,6 @@ services:
       start_period: 600s
 
   ocr-pipeline:
-    profiles: ["gpu"]
     build: ./ocr_pipeline
     restart: unless-stopped
     ports:
@@ -50,30 +44,12 @@ services:
     environment:
       - MODEL_BACKEND=vllm
       - MODEL_SERVER_URL=http://ocr-model:39671
+      - MODEL_NAME=deepseek-ai/DeepSeek-OCR-2
       - INPUT_DIR=/data/input
       - OUTPUT_DIR=/data/output
     depends_on:
       ocr-model:
         condition: service_healthy
 
-  ocr-pipeline-cpu:
-    profiles: ["cpu"]
-    build:
-      context: .
-      dockerfile: ocr_pipeline/Dockerfile.cpu
-    restart: unless-stopped
-    ports:
-      - "39672:39672"
-    volumes:
-      - ./input:/data/input
-      - ./output:/data/output
-      - hf-cache:/root/.cache/huggingface
-    environment:
-      - MODEL_BACKEND=local
-      - LOCAL_DEVICE=cpu
-      - INPUT_DIR=/data/input
-      - OUTPUT_DIR=/data/output
-      - HF_HOME=/root/.cache/huggingface
-
 volumes:
   hf-cache:
diff --git a/docs/ottoman-turkish-ocr-research-brief.md b/docs/ottoman-turkish-ocr-research-brief.md
new file mode 100644
index 0000000..37813bd
--- /dev/null
+++ b/docs/ottoman-turkish-ocr-research-brief.md
@@ -0,0 +1,218 @@
+# Ottoman and Turkish OCR Research Brief
+
+This note describes the current OpenCR pipeline and frames the research question we need help with: how to get more reliable OCR and structured text from Turkish, old Turkish, Latinized Ottoman, and Ottoman-script material without making the operator workflow heavy or over-engineered.
+
+## Current System
+
+OpenCR is a GPU-first OCR workbench. A user uploads or registers PDF documents, groups them, edits document-level metadata, runs OCR, inspects page images beside extracted text, and exports the result for corpus work or model training.
+
+The current OCR path is:
+
+1. Register PDFs in a document catalog.
+2. Store document metadata such as title, author, work, book, date label, date precision, language, script, license, citation, notes, and group path.
+3. Render each PDF page to an image.
+4. Send the page image to DeepSeek-OCR-2 through the vLLM/OpenAI-compatible backend.
+5. Clean the OCR text conservatively.
+6. Validate the page for obvious extraction failures and corpus-quality warnings.
+7. Store per-page metadata, quality flags, raw text, clean text, markdown, source PDFs, OCR image/text pairs, and text bundles.
+8. Export HuggingFace-friendly datasets.
+
+The system intentionally keeps the pipeline small. It does not try to silently "fix" historical text into modern Turkish. The goal is to preserve historical orthography, diacritics, transliteration choices, and document provenance.
+
+Relevant implementation areas:
+
+- `ocr_pipeline/services/batch_processor.py`: page rendering, OCR calls, retry strategy, validation, metadata collection.
+- `ocr_pipeline/services/ocr_engine.py`: OpenAI-compatible client for vLLM or another GPU model server.
+- `ocr_pipeline/services/output_validator.py`: page-level validation and quality flags.
+- `ocr_pipeline/services/text_cleaner.py`: conservative OCR cleanup.
+- `ocr_pipeline/services/text_normalizer.py`: optional NLP-oriented normalization for exported text only.
+- `ocr_pipeline/services/text_bundle_exporter.py`: raw, clean, and normalized text bundles.
+- `ocr_pipeline/services/ocr_pair_exporter.py`: page image plus text pairs for OCR model fine-tuning.
+- `ocr_pipeline/services/dataset_exporter.py`: HuggingFace-style page/document dataset export.
+
+## Model Context
+
+OpenCR now uses `deepseek-ai/DeepSeek-OCR-2` as the base model. The model is distributed on HuggingFace with Apache-2.0 license metadata, is listed as a 3B BF16 image-text-to-text model, and supports Free OCR plus grounded markdown prompts. The DeepSeek-OCR-2 paper introduces DeepEncoder V2 / Visual Causal Flow, where visual tokens can be reordered by document semantics instead of being forced through a fixed raster-scan order. That makes it a better first candidate for complex pages, but not a substitute for domain-specific Ottoman/Turkish benchmarking.
+
+The vLLM recipe for DeepSeek-OCR-2 documents OpenAI-compatible online serving with `vllm serve deepseek-ai/DeepSeek-OCR-2`, the custom `NGramPerReqLogitsProcessor`, disabled prefix caching, and multimodal image prompts. That matches OpenCR's GPU-first runtime direction.
+
+For Ottoman Turkish, the research risk is larger than plain OCR accuracy. Arabic-script Ottoman has right-to-left script behavior, ligatures, weak or ambiguous vowel representation, historical fonts, and no clean one-to-one mapping into modern Latin Turkish. Prior work on Ottoman periodicals emphasizes that transcription into a Latin writing system is itself a modeling choice, not merely a character-recognition task.
+
+## What We Already Preserve
+
+OpenCR keeps separate text layers:
+
+- `raw`: the model output, captured with minimal processing.
+- `clean`: conservative cleaned text for reading and corpus publication.
+- `normalized`: optional NLP-oriented text, currently used only in text-bundle exports.
+
+This split matters. For scholarly work, `clean` should remain close to what OCR produced. `normalized` can join broken line hyphenation, remove simple markup leaks, and make tokenization easier, but it must not replace the archive-facing text.
+
+OpenCR also stores:
+
+- source PDF SHA256,
+- source filename,
+- model name,
+- pipeline version,
+- extraction mode,
+- extraction attempt,
+- DPI,
+- page status,
+- validation issues,
+- quality flags,
+- language/script metadata,
+- project attribution through OpenCR/cdli.ai metadata.
+
+Current quality flags include visible corpus warnings such as line-break hyphenation and markup leakage. These are not the same as "OCR is wrong"; they mean the page should not be treated as ground truth without review.
+
+## Main Research Question
+
+How can we improve OCR accuracy and corpus usefulness for Turkish, old Turkish, Latinized Ottoman, and Ottoman-script documents while preserving historically meaningful forms and keeping the OpenCR workflow benchmarkable and repeatable?
+
+## Questions To Investigate
+
+1. Which DeepSeek-OCR-2 prompt and image settings work best for our material?
+
+   Compare Free OCR, grounded markdown, crop mode on/off, image sizes, and DPI values. Measure separately for Latinized Ottoman, modern Turkish Latin, Arabic-script Ottoman, tables, title pages, and degraded scans.
+
+2. When should we use OCR, HTR/ATR, or a specialist engine?
+
+   DeepSeek-OCR-2 is useful as a general vision-language OCR model. Kraken/eScriptorium-style ATR may be better for historical and non-Latin scripts when trained or fine-tuned on our domain. We should compare rather than assume one model is best.
+
+3. What is the right target text?
+
+   For Ottoman-script documents, there are at least three possible targets:
+
+   - diplomatic transcription preserving script-level details,
+   - scholarly transliteration,
+   - modern Turkish normalization.
+
+   These should be separate dataset columns or export layers, not one overwritten text field.
+
+4. Which errors are dangerous for scholarship?
+
+   Aggregate CER/WER is not enough. Historical OCR can silently modernize orthography, normalize rare forms, drop diacritics, or turn historically meaningful spelling into more common contemporary spelling. We need an error taxonomy.
+
+5. How much ground truth is enough?
+
+   Start with a small reviewed set: 20-50 pages chosen across document types, scan quality, script, date, and layout. Manually correct at page or line level, then calculate CER/WER and error categories.
+
+6. What should be corrected automatically?
+
+   Only low-risk transformations should be automatic in `normalized`: line-break hyphen joining, obvious markup removal, whitespace normalization, and page header/footer handling if confidence is high. Orthographic modernization should remain opt-in and separately labeled.
+
+## Evaluation Protocol
+
+Use a small gold set first, then expand only after the measurements are useful.
+
+Recommended first benchmark:
+
+- 10 pages: clean modern Turkish Latin print.
+- 10 pages: Latinized Ottoman / early Republican Turkish with extended Latin characters.
+- 10 pages: Arabic-script Ottoman print.
+- 5 pages: tables, treaties, or structured forms.
+- 5 pages: degraded scans or unusual typography.
+
+For every page, store:
+
+- page image,
+- source PDF,
+- OCR raw text,
+- OCR clean text,
+- human-reviewed text,
+- target convention used by the reviewer,
+- reviewer notes,
+- CER,
+- WER,
+- quality flags,
+- error categories.
+
+Important error categories:
+
+- character substitution,
+- dropped diacritic,
+- inserted diacritic,
+- word split,
+- word merge,
+- line-break hyphenation,
+- layout-order error,
+- header/footer leakage,
+- table structure loss,
+- script confusion,
+- Ottoman-to-modern normalization,
+- hallucinated word,
+- omitted text.
+
+## Practical Improvements Worth Trying
+
+These are useful without making OpenCR heavy.
+
+1. Add an "evaluation set" mode.
+
+   Let users mark selected pages as evaluation pages and attach reviewed text later. The pipeline can then calculate CER/WER and export a small benchmark bundle.
+
+2. Add extraction profiles.
+
+   Instead of many UI controls, define a few named profiles:
+
+   - `latin_print_fast`
+   - `latin_print_careful`
+   - `ottoman_arabic_print`
+   - `tables_and_forms`
+   - `fine_tune_pairs`
+
+   A profile can choose DPI, prompt, crop mode, and validation thresholds.
+
+3. Keep the image/text pair export central.
+
+   Fine-tuning needs page or line images matched with reviewed text. Current OCR pairs are useful, but the strongest version should include `reviewed_text`, `review_status`, and `target_convention`.
+
+4. Add script-aware validation.
+
+   If metadata says `script=latin_extended`, warn when the page is mostly Arabic script. If metadata says Ottoman Arabic script, warn when the extracted text is mostly Latin unless transliteration is the intended target.
+
+5. Add a no-silent-modernization rule.
+
+   Any step that changes historical spelling or transliteration must write to a new layer, not mutate `clean`.
+
+6. Add simple page-level layout labels.
+
+   `prose`, `title_page`, `table`, `mixed`, `index`, `blank`, and `image_only` would help researchers filter outputs and compare OCR modes.
+
+7. Store the extraction profile for each run.
+
+   HuggingFace exports should say not only which model was used, but also which extraction profile, DPI, prompt mode, crop mode, and cleanup mode were used.
+
+## What Not To Do Yet
+
+Do not add a large automatic correction pipeline before we have ground truth. It would make the text look cleaner while hiding errors.
+
+Do not collapse Ottoman-script transcription, transliteration, and modern Turkish normalization into one field. They answer different scholarly questions.
+
+Do not judge model quality only from pages that "look readable." A page can be readable and still be bad for named entities, dates, legal terms, diacritics, or rare Ottoman forms.
+
+Do not publish a dataset as research-grade unless it has a reviewed subset and clear quality metadata.
+
+## Suggested Research Deliverable
+
+Ask the researcher to produce:
+
+1. A short survey of OCR/ATR options for Turkish, old Turkish, Latinized Ottoman, and Arabic-script Ottoman print.
+2. A recommended transcription target schema.
+3. A 20-50 page benchmark design.
+4. A CER/WER plus error-taxonomy evaluation plan.
+5. Recommended DeepSeek-OCR-2 profile settings to test.
+6. A proposal for when to use DeepSeek-OCR-2 versus Kraken/eScriptorium-style specialist ATR.
+7. A minimal metadata schema for HuggingFace publication that preserves source, script, date, model, pipeline, and review state.
+
+## Source Notes
+
+- DeepSeek-OCR-2 model card: https://huggingface.co/deepseek-ai/DeepSeek-OCR-2
+- DeepSeek-OCR-2 paper: https://arxiv.org/abs/2601.20552
+- vLLM DeepSeek-OCR-2 recipe: https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-OCR-2.html
+- Ottoman Turkish periodical transcription case study: https://arxiv.org/abs/2011.01139
+- Kraken documentation: https://kraken.re/main/
+- Arabic-script OCR with Kraken case study: https://arxiv.org/abs/2402.10943
+- Historical OCR error-pattern study: https://arxiv.org/abs/2602.14524
+- Historical newspaper OCR ground-truth example: https://lab.kb.nl/dataset/historical-newspapers-ocr-ground-truth
+- Printed Ottoman Turkish OCR study: https://ideas.repec.org/a/tec/techni/v18y2023i1p47-64.html
diff --git a/ocr-model/Dockerfile b/ocr-model/Dockerfile
index 6cbbdde..f75ea02 100644
--- a/ocr-model/Dockerfile
+++ b/ocr-model/Dockerfile
@@ -1,7 +1,7 @@
 FROM vllm/vllm-openai:latest
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 CMD [ \
-    "--model", "deepseek-ai/DeepSeek-OCR", \
+    "--model", "deepseek-ai/DeepSeek-OCR-2", \
     "--trust-remote-code", \
     "--logits-processors", "vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor", \
     "--no-enable-prefix-caching", \
diff --git a/ocr_pipeline/Dockerfile.cpu b/ocr_pipeline/Dockerfile.cpu
deleted file mode 100644
index 468104d..0000000
--- a/ocr_pipeline/Dockerfile.cpu
+++ /dev/null
@@ -1,34 +0,0 @@
-# CPU-only image: pipeline + in-process transformers backend.
-# No vLLM, no NVIDIA runtime needed. Builds on any host.
-#
-# Build context is the repo root (set by docker-compose.yml) so we can pull in
-# requirements-local.txt alongside the pipeline source.
-#
-# The image is ~3 GB because it bundles torch + transformers; the model
-# weights themselves (~6 GB) download on first request and cache to the
-# hf-cache volume.
-FROM python:3.12-slim
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    poppler-utils \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-
-COPY ocr_pipeline/requirements.txt /tmp/requirements.txt
-COPY requirements-local.txt /tmp/requirements-local.txt
-# `--extra-index-url` pulls the CPU-only torch wheel.
-RUN pip install --no-cache-dir \
-      --extra-index-url https://download.pytorch.org/whl/cpu \
-      -r /tmp/requirements.txt -r /tmp/requirements-local.txt
-
-COPY ocr_pipeline /app/ocr_pipeline/
-
-ENV PYTHONPATH=/app
-ENV MODEL_BACKEND=local
-ENV LOCAL_DEVICE=cpu
-
-EXPOSE 39672
-
-CMD ["uvicorn", "ocr_pipeline.main:app", "--host", "0.0.0.0", "--port", "39672", "--no-access-log"]
diff --git a/ocr_pipeline/config.py b/ocr_pipeline/config.py
index d2b13b2..831459c 100644
--- a/ocr_pipeline/config.py
+++ b/ocr_pipeline/config.py
@@ -19,27 +19,19 @@ def _default_output_dir() -> Path:
 
 class Settings(BaseSettings):
     # Model backend selection
-    # - "vllm" / "remote": call any OpenAI-compatible /v1/chat/completions server
-    # - "local" / "transformers": load DeepSeek-OCR in-process via transformers (Mac/CPU)
-    model_backend: Literal["vllm", "remote", "local", "transformers"] = "vllm"
+    # "vllm" and "remote" both call an OpenAI-compatible /v1/chat/completions server.
+    # OpenCR is GPU-first; local Apple/CPU transformers inference is not shipped.
+    model_backend: Literal["vllm", "remote"] = "vllm"
     model_server_url: str = "http://ocr-model:39671"
-    model_name: str = "deepseek-ai/DeepSeek-OCR"
+    model_name: str = "deepseek-ai/DeepSeek-OCR-2"
     model_api_key: str = "EMPTY"
     model_timeout: float = 120.0
 
-    # Local backend (Apple Silicon / CPU)
-    local_device: Literal["auto", "mps", "cuda", "cpu"] = "auto"
-    local_dtype: Literal["auto", "float16", "bfloat16", "float32"] = "auto"
-    local_attn_implementation: Literal["auto", "eager", "sdpa", "flash_attention_2"] = (
-        "auto"
-    )
-    local_model_cache: Path = Path.home() / ".cache" / "huggingface"
-
     # Startup readiness (used by the remote backend)
     model_ready_timeout: int = 300
     model_ready_interval: int = 5
 
-    # NGram processor defaults (vLLM-only feature; ignored by local backend)
+    # NGram processor defaults for DeepSeek-OCR-2 on vLLM.
     ngram_size: int = 30
     window_size: int = 90
     whitelist_token_ids: list[int] = [128821, 128822]  # <td>, </td>
@@ -80,9 +72,5 @@ class Settings(BaseSettings):
 
     model_config = {"env_prefix": "", "case_sensitive": False, "extra": "ignore"}
 
-    @property
-    def is_local_backend(self) -> bool:
-        return self.model_backend in ("local", "transformers")
-
 
 settings = Settings()
diff --git a/ocr_pipeline/models/schemas.py b/ocr_pipeline/models/schemas.py
index e7474e5..8907197 100644
--- a/ocr_pipeline/models/schemas.py
+++ b/ocr_pipeline/models/schemas.py
@@ -74,8 +74,6 @@ class HealthResponse(BaseModel):
     model_status: str
     input_dir: str = ""
     output_dir: str = ""
-    local_model_cached: Optional[bool] = None
-    local_model_cache_dir: Optional[str] = None
 
 
 class FileInfo(BaseModel):
diff --git a/ocr_pipeline/routers/health.py b/ocr_pipeline/routers/health.py
index dd6a8d5..cad059c 100644
--- a/ocr_pipeline/routers/health.py
+++ b/ocr_pipeline/routers/health.py
@@ -21,8 +21,6 @@ async def health_check():
         model_status=model_readiness.status,
         input_dir=str(settings.input_dir),
         output_dir=str(settings.output_dir),
-        local_model_cached=model_readiness.local_model_cached,
-        local_model_cache_dir=model_readiness.local_model_cache_dir,
     )
     if not model_readiness.ready:
         return JSONResponse(content=resp.model_dump(), status_code=503)
diff --git a/ocr_pipeline/services/batch_processor.py b/ocr_pipeline/services/batch_processor.py
index 340a907..ac8bd88 100644
--- a/ocr_pipeline/services/batch_processor.py
+++ b/ocr_pipeline/services/batch_processor.py
@@ -62,10 +62,7 @@ def __init__(
         self.writer = OutputWriter()
         self.event_callback = event_callback
         self.strip_refs = strip_refs
-        default_concurrency = (
-            1 if settings.is_local_backend else settings.batch_concurrency
-        )
-        self.page_concurrency = max(1, page_concurrency or default_concurrency)
+        self.page_concurrency = max(1, page_concurrency or settings.batch_concurrency)
 
     async def _emit(self, event: dict) -> None:
         if self.event_callback:
diff --git a/ocr_pipeline/services/hf_publisher.py b/ocr_pipeline/services/hf_publisher.py
index 5be5609..f172a47 100644
--- a/ocr_pipeline/services/hf_publisher.py
+++ b/ocr_pipeline/services/hf_publisher.py
@@ -47,7 +47,7 @@ def _build_dataset_card(
         f"# OpenCR Dataset — Run `{run['id']}`",
         "",
         f"Generated by **[OpenCR](https://cdli.ai)** (cdli.ai) using "
-        f"`{run.get('model_used', 'DeepSeek-OCR')}` on "
+        f"`{run.get('model_used', 'DeepSeek-OCR-2')}` on "
         f"{run.get('completed_at') or run.get('created_at')}.",
         "",
         "## Summary",
diff --git a/ocr_pipeline/services/local_ocr_engine.py b/ocr_pipeline/services/local_ocr_engine.py
deleted file mode 100644
index 6c414fe..0000000
--- a/ocr_pipeline/services/local_ocr_engine.py
+++ /dev/null
@@ -1,271 +0,0 @@
-"""In-process OCR engine using HuggingFace `transformers`.
-
-Used by the `local` model backend so OpenCR runs on Apple Silicon, CPU-only
-boxes, and any environment without a GPU model server. Trades throughput for
-zero-deployment-friction: a single Python process boots the web UI and serves
-inference.
-
-Caveats:
-- DeepSeek-OCR is ~3B params + a vision tower; on M-series Macs expect
-  5–30 s/page, on CPU much slower. Production batch jobs should use vLLM.
-- The model loads lazily on the first extraction request so server startup
-  stays fast.
-- `transformers` and `torch` are intentionally optional — they only get
-  imported when this module is instantiated. Install them via
-  `requirements-local.txt`.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import io
-import logging
-import tempfile
-import warnings
-from contextlib import contextmanager, redirect_stdout
-from importlib.util import find_spec
-from pathlib import Path
-from typing import Any
-
-from PIL import Image
-
-from ocr_pipeline.config import settings
-
-logger = logging.getLogger("ocr_pipeline.local_engine")
-
-# Maps the same `mode` strings the remote backend uses to the prompt strings
-# DeepSeek-OCR's reference inference helper expects.
-LOCAL_PROMPTS = {
-    "markdown": "<image>\n<|grounding|>Convert the document to markdown.",
-    "free_ocr": "<image>\nFree OCR.",
-    "figure": "<image>\nParse the figure.",
-}
-
-NOISY_GENERATION_MESSAGES = (
-    r"`do_sample` is set to `False`.*`temperature` is set",
-    r"The attention mask and the pad token id were not set",
-    r"Setting `pad_token_id` to `eos_token_id`",
-)
-
-
-def _resolve_device(requested: str) -> str:
-    if requested != "auto":
-        return requested
-    try:
-        import torch
-    except ImportError as exc:
-        raise RuntimeError(
-            "MODEL_BACKEND=local requires `torch`. Install with: "
-            "pip install -r requirements-local.txt"
-        ) from exc
-    if torch.cuda.is_available():
-        return "cuda"
-    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
-        return "mps"
-    return "cpu"
-
-
-def _resolve_dtype(requested: str, device: str):
-    import torch
-
-    if requested == "float16":
-        return torch.float16
-    if requested == "bfloat16":
-        return torch.bfloat16
-    if requested == "float32":
-        return torch.float32
-    # auto — bf16 on CUDA, fp16 on MPS, fp32 on CPU (mps doesn't love bf16, cpu hates fp16)
-    if device == "cuda":
-        return torch.bfloat16
-    if device == "mps":
-        return torch.float16
-    return torch.float32
-
-
-def _resolve_attn_implementation(requested: str, device: str) -> str:
-    if requested != "auto":
-        if requested == "flash_attention_2" and find_spec("flash_attn") is None:
-            raise RuntimeError(
-                "LOCAL_ATTN_IMPLEMENTATION=flash_attention_2 requires `flash_attn`. "
-                "Install flash-attn, or unset LOCAL_ATTN_IMPLEMENTATION to use eager "
-                "attention."
-            )
-        return requested
-
-    if device == "cuda" and find_spec("flash_attn") is not None:
-        return "flash_attention_2"
-    return "eager"
-
-
-@contextmanager
-def _quiet_generation_noise():
-    """Hide repeated Transformers generation warnings emitted by remote code."""
-    noisy_loggers = [
-        logging.getLogger("transformers.generation.utils"),
-        logging.getLogger("transformers.generation.configuration_utils"),
-    ]
-    previous_disabled = [logger.disabled for logger in noisy_loggers]
-    with warnings.catch_warnings():
-        for message in NOISY_GENERATION_MESSAGES:
-            warnings.filterwarnings("ignore", message=message)
-        for logger in noisy_loggers:
-            logger.disabled = True
-        try:
-            yield
-        finally:
-            for logger, disabled in zip(noisy_loggers, previous_disabled):
-                logger.disabled = disabled
-
-
-class LocalOCREngine:
-    """In-process DeepSeek-OCR inference via `transformers`.
-
-    Only one instance is loaded per process; concurrent requests serialize on
-    the same model object via an asyncio lock since GPU/MPS memory makes
-    parallel calls impractical at this size.
-    """
-
-    _instance: "LocalOCREngine | None" = None
-
-    def __new__(cls, *args, **kwargs):
-        if cls._instance is None:
-            cls._instance = super().__new__(cls)
-        return cls._instance
-
-    def __init__(self, model_name: str | None = None) -> None:
-        if getattr(self, "_initialized", False):
-            return
-        self.model_name = model_name or settings.model_name
-        self._model: Any = None
-        self._tokenizer: Any = None
-        self._device: str | None = None
-        self._dtype: Any = None
-        self._lock = asyncio.Lock()
-        self._load_error: BaseException | None = None
-        self._initialized = True
-
-    async def _ensure_loaded(self) -> None:
-        if self._model is not None:
-            return
-        if self._load_error is not None:
-            raise RuntimeError("Local OCR engine failed to load") from self._load_error
-        async with self._lock:
-            if self._model is not None:
-                return
-            if self._load_error is not None:
-                raise RuntimeError(
-                    "Local OCR engine failed to load"
-                ) from self._load_error
-            try:
-                await asyncio.to_thread(self._load_blocking)
-            except Exception as exc:
-                self._load_error = exc
-                logger.error("Local OCR engine failed to load: %s", exc)
-                raise
-
-    def _load_blocking(self) -> None:
-        missing = [
-            package
-            for package in ("torch", "transformers", "tokenizers", "addict", "easydict")
-            if find_spec(package) is None
-        ]
-        if missing:
-            raise RuntimeError(
-                "MODEL_BACKEND=local missing package(s): "
-                f"{', '.join(missing)}. Install with: "
-                "pip install -r ocr_pipeline/requirements.txt -r requirements-local.txt"
-            )
-        from transformers import AutoModel, AutoTokenizer
-
-        device = _resolve_device(settings.local_device)
-        dtype = _resolve_dtype(settings.local_dtype, device)
-        logger.info(
-            "Loading %s on %s (%s). First boot downloads ~6 GB.",
-            self.model_name,
-            device,
-            dtype,
-        )
-
-        attn_impl = _resolve_attn_implementation(
-            settings.local_attn_implementation, device
-        )
-        logger.info("Using %s attention implementation.", attn_impl)
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.model_name,
-            trust_remote_code=True,
-            cache_dir=str(settings.local_model_cache),
-        )
-        model = AutoModel.from_pretrained(
-            self.model_name,
-            trust_remote_code=True,
-            use_safetensors=True,
-            attn_implementation=attn_impl,
-            cache_dir=str(settings.local_model_cache),
-        )
-        model = model.eval().to(dtype)
-        if device != "cpu":
-            model = model.to(device)
-
-        self._tokenizer = tokenizer
-        self._model = model
-        self._device = device
-        self._dtype = dtype
-        logger.info("Local OCR engine ready on %s", device)
-
-    async def extract_page(
-        self,
-        image: Image.Image,
-        mode: str = "markdown",
-        ngram_size: int | None = None,  # noqa: ARG002 (vLLM-only knob)
-        window_size: int | None = None,  # noqa: ARG002
-    ) -> str:
-        await self._ensure_loaded()
-        prompt = LOCAL_PROMPTS.get(mode, LOCAL_PROMPTS["markdown"])
-
-        async with self._lock:
-            return await asyncio.to_thread(self._infer_blocking, image, prompt)
-
-    def _infer_blocking(self, image: Image.Image, prompt: str) -> str:
-        # DeepSeek-OCR's `model.infer` (registered via trust_remote_code) expects a
-        # path on disk for the image and writes its result alongside it. We feed it
-        # a temp dir so nothing leaks into the output volume.
-        with tempfile.TemporaryDirectory() as tmpdir:
-            tmp = Path(tmpdir)
-            image_path = tmp / "page.png"
-            image.save(image_path, format="PNG")
-
-            remote_stdout = io.StringIO()
-            with redirect_stdout(remote_stdout), _quiet_generation_noise():
-                try:
-                    result = self._model.infer(
-                        self._tokenizer,
-                        prompt=prompt,
-                        image_file=str(image_path),
-                        output_path=str(tmp),
-                        base_size=1024,
-                        image_size=640,
-                        crop_mode=True,
-                        save_results=False,
-                        test_compress=False,
-                        eval_mode=True,
-                    )
-                except TypeError:
-                    # Older variants of the remote-code helper had a slightly
-                    # different signature; fall back to the minimal kwargs.
-                    result = self._model.infer(
-                        self._tokenizer,
-                        prompt=prompt,
-                        image_file=str(image_path),
-                        output_path=str(tmp),
-                    )
-
-            if remote_stdout.getvalue():
-                logger.debug("Suppressed verbose model stdout during local inference.")
-
-            if isinstance(result, str):
-                return result
-            # Some forks return a dict / list; prefer a 'text' key, else stringify.
-            if isinstance(result, dict) and "text" in result:
-                return str(result["text"])
-            return str(result) if result is not None else ""
diff --git a/ocr_pipeline/services/metadata_collector.py b/ocr_pipeline/services/metadata_collector.py
index 58d7360..5c8e30c 100644
--- a/ocr_pipeline/services/metadata_collector.py
+++ b/ocr_pipeline/services/metadata_collector.py
@@ -12,7 +12,7 @@
 class MetadataCollector:
     """Builds metadata during extraction."""
 
-    def __init__(self, model_name: str = "deepseek-ai/DeepSeek-OCR"):
+    def __init__(self, model_name: str = "deepseek-ai/DeepSeek-OCR-2"):
         self.model_name = model_name
         try:
             self._tokenizer = tiktoken.get_encoding("cl100k_base")
diff --git a/ocr_pipeline/services/ocr_engine.py b/ocr_pipeline/services/ocr_engine.py
index 494334d..3afa010 100644
--- a/ocr_pipeline/services/ocr_engine.py
+++ b/ocr_pipeline/services/ocr_engine.py
@@ -1,15 +1,9 @@
 """OCR engine abstraction.
 
-Two backends ship today:
-
-- `RemoteOCREngine` calls any OpenAI-compatible `/v1/chat/completions` endpoint
-  (vLLM serving DeepSeek-OCR is the production target; remote endpoints like
-  OpenRouter or a self-hosted shim work the same way).
-- `LocalOCREngine` runs DeepSeek-OCR in-process via `transformers`. Slow but
-  needs no GPU server — used for Apple Silicon / CPU development.
-
-Pick a backend with the `MODEL_BACKEND` env var. `OCREngine()` returns the
-right instance based on `settings.model_backend`.
+OpenCR is GPU-first: `RemoteOCREngine` calls an OpenAI-compatible
+`/v1/chat/completions` endpoint, with vLLM serving DeepSeek-OCR-2 as the
+default production target. `MODEL_BACKEND=remote` can point the same client at
+another compatible GPU service.
 """
 from __future__ import annotations
 
@@ -108,13 +102,5 @@ async def extract_page(
 
 
 def OCREngine(*args, **kwargs) -> _OCREngineProtocol:
-    """Factory. Returns the engine matching `settings.model_backend`.
-
-    Existing callers do `OCREngine()` so we keep this name as a callable.
-    """
-    if settings.is_local_backend:
-        # Imported lazily so projects without the local extras (transformers,
-        # torch) can still use the remote backend without import errors.
-        from ocr_pipeline.services.local_ocr_engine import LocalOCREngine
-        return LocalOCREngine(*args, **kwargs)
+    """Factory kept for existing callers; always returns a `RemoteOCREngine`."""
     return RemoteOCREngine(*args, **kwargs)
diff --git a/ocr_pipeline/services/startup.py b/ocr_pipeline/services/startup.py
index c68fa75..1c43f81 100644
--- a/ocr_pipeline/services/startup.py
+++ b/ocr_pipeline/services/startup.py
@@ -3,7 +3,6 @@
 import time
 
 import httpx
-from huggingface_hub import try_to_load_from_cache
 
 from ocr_pipeline.config import settings
 
@@ -18,15 +17,10 @@ def __init__(self):
         self.model_name: str | None = None
         self.error: str | None = None
         self.checked_at: float = 0
-        self.local_model_cached: bool | None = None
-        self.local_model_cache_dir: str | None = None
-        self.note: str | None = None
 
     @property
     def status(self) -> str:
         if self.ready:
-            if self.note:
-                return self.note
             return "ready"
         if self.error:
             return f"waiting ({self.error})"
@@ -36,63 +30,11 @@ def status(self) -> str:
 model_readiness = ModelReadiness()
 
 
-def _local_model_cache_files_present() -> bool:
-    required_files = (
-        "config.json",
-        "tokenizer_config.json",
-        "tokenizer.json",
-        "model.safetensors.index.json",
-    )
-    return all(
-        try_to_load_from_cache(
-            settings.model_name,
-            filename,
-            cache_dir=settings.local_model_cache,
-        )
-        for filename in required_files
-    )
-
-
-async def configure_local_readiness(
-    readiness: ModelReadiness = model_readiness,
-) -> bool:
-    cached = await asyncio.to_thread(_local_model_cache_files_present)
-    readiness.ready = True
-    readiness.model_name = settings.model_name
-    readiness.error = None
-    readiness.checked_at = time.time()
-    readiness.local_model_cached = cached
-    readiness.local_model_cache_dir = str(settings.local_model_cache)
-    readiness.note = (
-        "ready (local model cached)"
-        if cached
-        else ("ready (local model will download on first extraction)")
-    )
-    if cached:
-        logger.info(
-            "Local backend selected; model cache found at %s.",
-            settings.local_model_cache,
-        )
-    else:
-        logger.warning(
-            "Local backend selected; model is not fully cached at %s. "
-            "First extraction will download model files.",
-            settings.local_model_cache,
-        )
-    return True
-
-
 async def wait_for_model_server() -> bool:
     """
     Block until the model server is healthy and can list its model.
     Called once at pipeline startup. Returns True if ready, False if timed out.
-
-    For the in-process `local` backend there is nothing to wait for — the model
-    loads lazily on the first request — so we mark ready immediately.
     """
-    if settings.is_local_backend:
-        return await configure_local_readiness()
-
     base = settings.model_server_url
     timeout = settings.model_ready_timeout
     interval = settings.model_ready_interval
diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css
index f19bb42..132ed6b 100644
--- a/ocr_pipeline/static/css/style.css
+++ b/ocr_pipeline/static/css/style.css
@@ -69,6 +69,35 @@ a { color: var(--accent); }
 .title-row { display: flex; align-items: baseline; gap: 10px; }
 .title-row h1 { margin: 0; font-size: 1.6rem; line-height: 1; }
 
+.view-nav {
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  padding: 4px;
+  border: 1px solid var(--border);
+  border-radius: 999px;
+  background: var(--surface-strong);
+}
+
+.nav-tab {
+  min-height: 32px;
+  padding: 6px 12px;
+  border: 0;
+  border-radius: 999px;
+  background: transparent;
+  color: var(--muted);
+  font: inherit;
+  font-size: 0.84rem;
+  font-weight: 700;
+  cursor: pointer;
+}
+
+.nav-tab:hover,
+.nav-tab.active {
+  background: var(--accent-soft);
+  color: var(--accent);
+}
+
 .topbar-meta { display: flex; align-items: center; gap: 16px; }
 
 .metric-strip {
@@ -113,6 +142,7 @@ a { color: var(--accent); }
 }
 
 .console-grid.document-mode { grid-template-columns: 280px minmax(0, 1fr); }
+.console-grid.solo-mode { grid-template-columns: minmax(0, 1fr); }
 
 .rail, .stage, .inspector {
   border: 1px solid var(--border);
@@ -523,6 +553,116 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 .doc-name { font-weight: 600; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
 .doc-meta { display: flex; gap: 6px; align-items: center; margin-top: 4px; font-size: 0.82rem; color: var(--muted); }
 
+/* ---------------- profiles + benchmarks ---------------- */
+
+.profile-console,
+.benchmark-console,
+.run-overview {
+  display: flex;
+  flex-direction: column;
+  min-height: 100%;
+}
+
+.profile-grid {
+  display: grid;
+  grid-template-columns: repeat(3, minmax(0, 1fr));
+  gap: 14px;
+  padding: 18px 24px 24px;
+}
+
+.profile-card,
+.benchmark-panel {
+  border: 1px solid var(--border);
+  border-radius: var(--radius-sm);
+  background: rgba(255, 255, 255, 0.7);
+  overflow: hidden;
+}
+
+.profile-card { padding: 14px 16px; }
+
+.profile-card-head {
+  display: flex;
+  justify-content: space-between;
+  align-items: flex-start;
+  gap: 12px;
+  margin-bottom: 12px;
+}
+
+.profile-card .eyebrow {
+  margin: 0 0 5px;
+  text-transform: uppercase;
+  letter-spacing: 0.12em;
+  color: var(--muted);
+  font-size: 0.66rem;
+  font-weight: 700;
+}
+
+.profile-card h3 {
+  margin: 0;
+  font: 0.94rem var(--font-mono);
+  overflow-wrap: anywhere;
+}
+
+.profile-specs {
+  display: grid;
+  gap: 8px;
+  margin: 0;
+}
+
+.profile-specs div {
+  display: grid;
+  grid-template-columns: 74px minmax(0, 1fr);
+  gap: 10px;
+  align-items: baseline;
+}
+
+.profile-specs dt {
+  color: var(--muted);
+  font-size: 0.72rem;
+  font-weight: 700;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+}
+
+.profile-specs dd {
+  margin: 0;
+  min-width: 0;
+  font-size: 0.86rem;
+  overflow-wrap: anywhere;
+}
+
+.benchmark-panel {
+  display: grid;
+  margin: 18px 24px 24px;
+}
+
+.benchmark-row {
+  display: grid;
+  grid-template-columns: minmax(180px, 1.2fr) minmax(180px, 1fr) 90px 90px 120px;
+  align-items: center;
+  gap: 12px;
+  min-height: 48px;
+  padding: 9px 14px;
+  border-bottom: 1px solid var(--border);
+}
+
+.benchmark-row:last-child { border-bottom: 0; }
+
+.benchmark-row-head {
+  min-height: 36px;
+  background: rgba(115, 100, 82, 0.08);
+  color: var(--muted);
+  font-size: 0.72rem;
+  font-weight: 700;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+}
+
+.benchmark-row code {
+  font-family: var(--font-mono);
+  color: var(--accent);
+}
+
 /* ---------------- inspector ---------------- */
 
 .inspector { padding: 0; }
@@ -712,6 +852,7 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
 @media (max-width: 1280px) {
   .console-grid { grid-template-columns: 240px minmax(0, 1fr) 460px; }
   .console-grid.document-mode { grid-template-columns: 240px minmax(0, 1fr); }
+  .profile-grid { grid-template-columns: repeat(2, minmax(0, 1fr)); }
 }
 
 @media (max-width: 1080px) {
@@ -723,10 +864,14 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent);
   .inspector { max-height: 720px; }
   .metric-strip { display: none; }
   .document-workbench-body { grid-template-columns: 1fr; }
+  .profile-grid { grid-template-columns: 1fr; }
+  .benchmark-panel { overflow-x: auto; }
+  .benchmark-row { min-width: 760px; }
 }
 
 @media (max-width: 720px) {
   .topbar { flex-direction: column; align-items: stretch; gap: 10px; }
+  .view-nav { justify-content: space-between; overflow-x: auto; }
   .document-toolbar { flex-wrap: wrap; }
   .toolbar-search, .toolbar-group { max-width: none; width: 100%; }
   .document-row { grid-template-columns: 24px minmax(160px, 1fr) 72px 76px; }
diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html
index c76691b..dade4c7 100644
--- a/ocr_pipeline/static/index.html
+++ b/ocr_pipeline/static/index.html
@@ -21,6 +21,12 @@
       <h1>OpenCR</h1>
     </div>
   </div>
+  <nav class="view-nav" aria-label="OpenCR sections">
+    <button class="nav-tab" :class="{ active: activeView === 'documents' }" @click="setActiveView('documents')">Documents</button>
+    <button class="nav-tab" :class="{ active: activeView === 'runs' }" @click="setActiveView('runs')">Runs</button>
+    <button class="nav-tab" :class="{ active: activeView === 'profiles' }" @click="setActiveView('profiles')">Profiles</button>
+    <button class="nav-tab" :class="{ active: activeView === 'benchmarks' }" @click="setActiveView('benchmarks')">Benchmarks</button>
+  </nav>
   <div class="topbar-meta">
     <div class="metric-strip">
       <div class="metric"><span class="metric-label">Active</span><span class="metric-value" x-text="metrics.active_jobs"></span></div>
@@ -45,10 +51,10 @@ <h1>OpenCR</h1>
   </div>
 </header>
 
-<main class="console-grid" :class="{ 'document-mode': !selectedRunId }">
+<main class="console-grid" :class="{ 'document-mode': activeView === 'runs' && !selectedRunId, 'solo-mode': activeView !== 'runs' }">
 
   <!-- Left rail: Runs sidebar -->
-  <aside class="rail">
+  <aside class="rail" x-show="activeView === 'runs'">
     <div class="rail-header">
       <h2>Runs</h2>
       <div class="rail-actions">
@@ -87,7 +93,7 @@ <h2>Runs</h2>
   <section class="stage">
 
     <!-- Document workbench when no run selected -->
-    <div class="document-workbench" x-show="!selectedRunId">
+    <div class="document-workbench" x-show="activeView === 'documents'">
       <header class="stage-header">
         <div>
           <h2>Documents</h2>
@@ -215,7 +221,21 @@ <h3 x-text="documentDraft.display_title || documentDraft.filename"></h3>
     </div>
 
     <!-- Run detail when a run is selected -->
-    <div class="run-detail" x-show="selectedRunId && selectedRun">
+    <div class="run-overview" x-show="activeView === 'runs' && !selectedRunId">
+      <header class="stage-header">
+        <div>
+          <h2>Runs</h2>
+          <p class="stage-sub">Select a run from the rail, or start a new extraction from Documents.</p>
+        </div>
+        <div class="stage-actions">
+          <button class="btn btn-ghost btn-sm" @click="refreshRuns">Refresh</button>
+          <button class="btn btn-primary btn-sm" @click="selectRun(null)">New run</button>
+        </div>
+      </header>
+      <p class="empty-note">Run history is managed from the rail.</p>
+    </div>
+
+    <div class="run-detail" x-show="activeView === 'runs' && selectedRunId && selectedRun">
       <header class="stage-header">
         <div>
           <h2>Run <code x-text="selectedRun?.id"></code></h2>
@@ -312,10 +332,76 @@ <h3 class="section-title">Documents</h3>
       </section>
     </div>
 
+    <div class="profile-console" x-show="activeView === 'profiles'">
+      <header class="stage-header">
+        <div>
+          <h2>OCR Profiles</h2>
+          <p class="stage-sub">Base model: <code>deepseek-ai/DeepSeek-OCR-2</code></p>
+        </div>
+        <div class="stage-actions">
+          <span class="pill pill-active">GPU-first</span>
+          <span class="pill pill-muted" x-text="currentModel"></span>
+        </div>
+      </header>
+
+      <section class="profile-grid">
+        <template x-for="profile in extractionProfiles" :key="profile.id">
+          <article class="profile-card">
+            <div class="profile-card-head">
+              <div>
+                <p class="eyebrow" x-text="profile.role"></p>
+                <h3 x-text="profile.id"></h3>
+              </div>
+              <span class="pill pill-sm" :class="profileStatusClass(profile.status)" x-text="profile.status"></span>
+            </div>
+            <dl class="profile-specs">
+              <div><dt>Engine</dt><dd x-text="profile.engine"></dd></div>
+              <div><dt>Model</dt><dd x-text="profile.model"></dd></div>
+              <div><dt>DPI</dt><dd x-text="profile.dpi"></dd></div>
+              <div><dt>Prompt</dt><dd x-text="profile.prompt"></dd></div>
+              <div><dt>Crop</dt><dd x-text="profile.crop"></dd></div>
+              <div><dt>Cleanup</dt><dd x-text="profile.cleanup"></dd></div>
+            </dl>
+          </article>
+        </template>
+      </section>
+    </div>
+
+    <div class="benchmark-console" x-show="activeView === 'benchmarks'">
+      <header class="stage-header">
+        <div>
+          <h2>Benchmarks</h2>
+          <p class="stage-sub">Same pages, same profiles, measured before choosing defaults per document type.</p>
+        </div>
+        <div class="stage-actions">
+          <span class="pill pill-warn">Evaluation pending</span>
+        </div>
+      </header>
+
+      <section class="benchmark-panel">
+        <div class="benchmark-row benchmark-row-head">
+          <span>Document type</span>
+          <span>Profile</span>
+          <span>CER</span>
+          <span>WER</span>
+          <span>Decision</span>
+        </div>
+        <template x-for="row in benchmarkRows" :key="`${row.documentType}-${row.profile}`">
+          <div class="benchmark-row">
+            <span x-text="row.documentType"></span>
+            <span x-text="row.profile"></span>
+            <code x-text="row.cer"></code>
+            <code x-text="row.wer"></code>
+            <span class="pill pill-sm" :class="row.best ? 'pill-success' : 'pill-muted'" x-text="row.decision"></span>
+          </div>
+        </template>
+      </section>
+    </div>
+
   </section>
 
   <!-- Right: Inspector -->
-  <aside class="inspector" x-show="selectedRunId && inspector.documentId">
+  <aside class="inspector" x-show="activeView === 'runs' && selectedRunId && inspector.documentId">
     <header class="inspector-header">
       <div>
         <p class="eyebrow">Inspector</p>
@@ -382,7 +468,7 @@ <h3 x-text="inspector.document?.filename || ''"></h3>
     </div>
   </aside>
 
-  <aside class="inspector inspector-empty" x-show="selectedRunId && !inspector.documentId">
+  <aside class="inspector inspector-empty" x-show="activeView === 'runs' && selectedRunId && !inspector.documentId">
     <p class="empty-note">Select a document to inspect.</p>
   </aside>
 
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index 5ad9dae..908231e 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -21,13 +21,78 @@ function emptyInspector() {
 
 function opencrApp() {
   return {
+    activeView: 'documents',
     version: '',
     healthStatus: 'checking...',
     healthClass: '',
+    currentModel: 'deepseek-ai/DeepSeek-OCR-2',
     metrics: {},
 
     auth: { enabled: false, authenticated: false, user: null },
 
+    extractionProfiles: [
+      {
+        id: 'latinized_ottoman_careful',
+        role: 'base profile',
+        status: 'default',
+        engine: 'vLLM',
+        model: 'deepseek-ai/DeepSeek-OCR-2',
+        dpi: '300 / 400',
+        prompt: 'Free OCR or grounded markdown',
+        crop: 'benchmark on/off',
+        cleanup: 'conservative',
+      },
+      {
+        id: 'ottoman_arabic_layout',
+        role: 'candidate',
+        status: 'benchmark',
+        engine: 'vLLM + layout detector',
+        model: 'deepseek-ai/DeepSeek-OCR-2',
+        dpi: '400',
+        prompt: 'grounded markdown',
+        crop: 'on',
+        cleanup: 'conservative',
+      },
+      {
+        id: 'tesseract_turkish_baseline',
+        role: 'baseline',
+        status: 'baseline',
+        engine: 'Tesseract LSTM',
+        model: 'tur',
+        dpi: '300',
+        prompt: 'n/a',
+        crop: 'off',
+        cleanup: 'minimal',
+      },
+    ],
+
+    benchmarkRows: [
+      {
+        documentType: 'Latinized Ottoman',
+        profile: 'latinized_ottoman_careful',
+        cer: 'pending',
+        wer: 'pending',
+        decision: 'measure',
+        best: false,
+      },
+      {
+        documentType: 'Ottoman Arabic print',
+        profile: 'ottoman_arabic_layout',
+        cer: 'pending',
+        wer: 'pending',
+        decision: 'measure',
+        best: false,
+      },
+      {
+        documentType: 'Modern Turkish print',
+        profile: 'tesseract_turkish_baseline',
+        cer: 'pending',
+        wer: 'pending',
+        decision: 'baseline',
+        best: false,
+      },
+    ],
+
     runs: [],
     selectedRunId: null,
     selectedRun: null,
@@ -96,9 +161,8 @@ function opencrApp() {
         const data = await API.health();
         this.version = data.pipeline_version || '';
         this.healthStatus = data.model_status || data.status;
-        this.healthClass = data.status === 'ready'
-          ? (data.local_model_cached === false ? 'waiting' : 'ready')
-          : 'waiting';
+        this.currentModel = data.model_name || this.currentModel;
+        this.healthClass = data.status === 'ready' ? 'ready' : 'waiting';
       } catch {
         this.healthStatus = 'offline';
         this.healthClass = 'error';
@@ -137,12 +201,14 @@ function opencrApp() {
     async selectRun(runId) {
       this._stream.disconnect();
       if (!runId) {
+        this.activeView = 'documents';
         this.selectedRunId = null;
         this.selectedRun = null;
         this.selectedRunDocumentIds = [];
         this.inspector = emptyInspector();
         return;
       }
+      this.activeView = 'runs';
       this.selectedRunId = runId;
       try {
         this.selectedRun = await API.getRun(runId);
@@ -271,6 +337,15 @@ function opencrApp() {
 
     pageStatusClass(status) { return PAGE_STATUS[status] || 'page-pending'; },
     runStatusClass(status) { return STATUS_PILL[status] || 'pill-muted'; },
+    profileStatusClass(status) {
+      if (status === 'default') return 'pill-success';
+      if (status === 'benchmark') return 'pill-warn';
+      return 'pill-muted';
+    },
+
+    setActiveView(view) {
+      this.activeView = view;
+    },
 
     documentProcessLabel(doc) {
       const status = doc.latest_run_status;
diff --git a/requirements-local.txt b/requirements-local.txt
deleted file mode 100644
index 738ece4..0000000
--- a/requirements-local.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Optional dependencies for the in-process `local` model backend.
-# Only needed when MODEL_BACKEND=local. Install on top of the base requirements:
-#   pip install -r ocr_pipeline/requirements.txt -r requirements-local.txt
-#
-# `torch` here is the CPU/MPS build. On NVIDIA boxes prefer the official
-# CUDA-matched wheels from https://pytorch.org/ instead of letting pip pick.
-torch>=2.4.0
-torchvision>=0.19.0
-transformers==4.46.3
-tokenizers==0.20.3
-accelerate>=0.34.0
-einops>=0.8.0
-sentencepiece>=0.2.0
-addict>=2.4.0
-easydict>=1.13
diff --git a/scripts/run_batch.py b/scripts/run_batch.py
index 2119d8e..5890fd0 100644
--- a/scripts/run_batch.py
+++ b/scripts/run_batch.py
@@ -60,7 +60,7 @@ async def progress_callback(event: dict):
 
 async def main():
     parser = argparse.ArgumentParser(
-        description="DeepSeek-OCR batch PDF extraction",
+        description="DeepSeek-OCR-2 batch PDF extraction",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     parser.add_argument(
diff --git a/scripts/start.sh b/scripts/start.sh
index 7bf3842..55c28da 100755
--- a/scripts/start.sh
+++ b/scripts/start.sh
@@ -1,13 +1,12 @@
 #!/usr/bin/env bash
-# OpenCR — local dev launcher.
+# OpenCR — GPU-first dev launcher.
 #
-# Defaults to the `local` backend if MODEL_BACKEND is unset and no
-# vLLM-style remote URL is reachable, so `./scripts/start.sh` from a
-# fresh clone Just Works on a Mac.
+# Defaults to a remote/OpenAI-compatible vLLM endpoint on localhost. For the
+# full GPU stack, prefer `docker compose up -d`.
 #
 # Override anything via env vars:
-#   MODEL_BACKEND=local|remote|vllm
-#   MODEL_SERVER_URL=https://your-endpoint
+#   MODEL_BACKEND=remote|vllm
+#   MODEL_SERVER_URL=http://localhost:39671
 #   MODEL_API_KEY=sk-...
 #   INPUT_DIR=./input  OUTPUT_DIR=./output
 #   PORT=39672
@@ -17,12 +16,9 @@ set -euo pipefail
 cd "$(dirname "$0")/.."
 
 if [[ -z "${MODEL_BACKEND:-}" ]]; then
-  if [[ -n "${MODEL_SERVER_URL:-}" ]]; then
-    export MODEL_BACKEND=remote
-  else
-    export MODEL_BACKEND=local
-  fi
+  export MODEL_BACKEND=remote
 fi
+export MODEL_SERVER_URL="${MODEL_SERVER_URL:-http://localhost:39671}"
 
 export INPUT_DIR="${INPUT_DIR:-$(pwd)/input}"
 export OUTPUT_DIR="${OUTPUT_DIR:-$(pwd)/output}"
diff --git a/tests/test_batch_processor.py b/tests/test_batch_processor.py
index 9b26881..6042c05 100644
--- a/tests/test_batch_processor.py
+++ b/tests/test_batch_processor.py
@@ -2,16 +2,7 @@
 from ocr_pipeline.services.batch_processor import BatchProcessor
 
 
-def test_local_backend_defaults_to_single_page_concurrency(monkeypatch):
-    monkeypatch.setattr(settings, "model_backend", "local")
-    monkeypatch.setattr(settings, "batch_concurrency", 8)
-
-    processor = BatchProcessor(db=object())
-
-    assert processor.page_concurrency == 1
-
-
-def test_remote_backend_keeps_configured_page_concurrency(monkeypatch):
+def test_gpu_backend_keeps_configured_page_concurrency(monkeypatch):
     monkeypatch.setattr(settings, "model_backend", "vllm")
     monkeypatch.setattr(settings, "batch_concurrency", 8)
 
@@ -20,8 +11,8 @@ def test_remote_backend_keeps_configured_page_concurrency(monkeypatch):
     assert processor.page_concurrency == 8
 
 
-def test_explicit_page_concurrency_overrides_local_default(monkeypatch):
-    monkeypatch.setattr(settings, "model_backend", "local")
+def test_explicit_page_concurrency_overrides_gpu_default(monkeypatch):
+    monkeypatch.setattr(settings, "model_backend", "vllm")
     monkeypatch.setattr(settings, "batch_concurrency", 8)
 
     processor = BatchProcessor(db=object(), page_concurrency=2)
diff --git a/tests/test_gpu_first_runtime.py b/tests/test_gpu_first_runtime.py
new file mode 100644
index 0000000..d0c6192
--- /dev/null
+++ b/tests/test_gpu_first_runtime.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+from typing import get_args
+
+from ocr_pipeline.config import Settings, settings
+from ocr_pipeline.models.schemas import HealthResponse
+from ocr_pipeline.services.startup import ModelReadiness
+
+
+def test_default_runtime_is_gpu_first_deepseek_ocr2():
+    assert settings.model_name == "deepseek-ai/DeepSeek-OCR-2"
+    assert set(get_args(Settings.model_fields["model_backend"].annotation)) == {
+        "vllm",
+        "remote",
+    }
+    assert not hasattr(settings, "local_device")
+
+
+def test_health_schema_does_not_expose_local_model_cache_state():
+    assert "local_model_cached" not in HealthResponse.model_fields
+    assert "local_model_cache_dir" not in HealthResponse.model_fields
+
+
+def test_model_readiness_tracks_only_remote_server_state():
+    readiness = ModelReadiness()
+
+    assert not hasattr(readiness, "local_model_cached")
+    assert not hasattr(readiness, "local_model_cache_dir")
+
+
+def test_local_backend_dependency_file_is_removed():
+    repo_root = Path(__file__).parents[1]
+
+    assert not (repo_root / "requirements-local.txt").exists()
diff --git a/tests/test_local_ocr_engine.py b/tests/test_local_ocr_engine.py
deleted file mode 100644
index 2d5e097..0000000
--- a/tests/test_local_ocr_engine.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import asyncio
-import logging
-import warnings
-
-import pytest
-from PIL import Image
-
-from ocr_pipeline.services.local_ocr_engine import (
-    LocalOCREngine,
-    _resolve_attn_implementation,
-)
-
-
-def test_local_engine_caches_load_failure(monkeypatch):
-    async def _scenario():
-        LocalOCREngine._instance = None
-        engine = LocalOCREngine()
-        calls = 0
-
-        def fail_load():
-            nonlocal calls
-            calls += 1
-            raise RuntimeError("missing dependency")
-
-        monkeypatch.setattr(engine, "_load_blocking", fail_load)
-
-        for _ in range(2):
-            try:
-                await engine._ensure_loaded()
-            except RuntimeError:
-                pass
-            else:
-                raise AssertionError("expected load failure")
-
-        assert calls == 1
-
-    asyncio.run(_scenario())
-
-
-def test_local_attn_auto_uses_eager_when_flash_attn_missing(monkeypatch):
-    monkeypatch.setattr(
-        "ocr_pipeline.services.local_ocr_engine.find_spec", lambda _name: None
-    )
-
-    assert _resolve_attn_implementation("auto", "cuda") == "eager"
-
-
-def test_local_attn_auto_uses_flash_when_available_on_cuda(monkeypatch):
-    monkeypatch.setattr(
-        "ocr_pipeline.services.local_ocr_engine.find_spec", lambda _name: object()
-    )
-
-    assert _resolve_attn_implementation("auto", "cuda") == "flash_attention_2"
-
-
-def test_local_attn_forced_flash_requires_flash_attn(monkeypatch):
-    monkeypatch.setattr(
-        "ocr_pipeline.services.local_ocr_engine.find_spec", lambda _name: None
-    )
-
-    with pytest.raises(RuntimeError, match="requires `flash_attn`"):
-        _resolve_attn_implementation("flash_attention_2", "cuda")
-
-
-def test_local_infer_uses_eval_mode_so_text_is_returned(monkeypatch):
-    LocalOCREngine._instance = None
-    engine = LocalOCREngine()
-    engine._tokenizer = object()
-
-    calls = {}
-
-    class FakeModel:
-        def infer(self, tokenizer, **kwargs):
-            calls.update(kwargs)
-            return "recognized text"
-
-    engine._model = FakeModel()
-
-    result = engine._infer_blocking(Image.new("RGB", (8, 8)), "<image>\nFree OCR.")
-
-    assert result == "recognized text"
-    assert calls["eval_mode"] is True
-    assert calls["save_results"] is False
-
-
-def test_local_infer_suppresses_remote_model_stdout(capsys):
-    LocalOCREngine._instance = None
-    engine = LocalOCREngine()
-    engine._tokenizer = object()
-
-    class FakeModel:
-        def infer(self, tokenizer, **kwargs):
-            print("remote model debug noise")
-            return "recognized text"
-
-    engine._model = FakeModel()
-
-    result = engine._infer_blocking(Image.new("RGB", (8, 8)), "<image>\nFree OCR.")
-
-    assert result == "recognized text"
-    assert "remote model debug noise" not in capsys.readouterr().out
-
-
-def test_local_infer_suppresses_repeated_generation_noise(capsys, caplog):
-    LocalOCREngine._instance = None
-    engine = LocalOCREngine()
-    engine._tokenizer = object()
-
-    class FakeModel:
-        def infer(self, tokenizer, **kwargs):
-            warnings.warn(
-                "`do_sample` is set to `False`. However, `temperature` is set to `0.0`",
-                stacklevel=1,
-            )
-            logging.getLogger("transformers.generation.utils").warning(
-                "The attention mask and the pad token id were not set."
-            )
-            return "recognized text"
-
-    engine._model = FakeModel()
-
-    with caplog.at_level(logging.WARNING):
-        result = engine._infer_blocking(Image.new("RGB", (8, 8)), "<image>\nFree OCR.")
-
-    assert result == "recognized text"
-    assert "temperature" not in capsys.readouterr().err
-    assert "attention mask" not in caplog.text
diff --git a/tests/test_ocr_pair_exporter.py b/tests/test_ocr_pair_exporter.py
index c641a9f..aa69c3d 100644
--- a/tests/test_ocr_pair_exporter.py
+++ b/tests/test_ocr_pair_exporter.py
@@ -38,7 +38,7 @@ def test_ocr_pair_export_writes_images_jsonl_and_manifest(tmp_path):
     result = exporter.export_run(
         run={
             "id": "run-1234",
-            "model_used": "deepseek-ai/DeepSeek-OCR",
+            "model_used": "deepseek-ai/DeepSeek-OCR-2",
             "pipeline_version": "2.0.0",
         },
         documents=[
diff --git a/tests/test_output_writer.py b/tests/test_output_writer.py
index 86a955a..bf77f4b 100644
--- a/tests/test_output_writer.py
+++ b/tests/test_output_writer.py
@@ -87,7 +87,7 @@ def build_document() -> DocumentMetadata:
         started_at="2026-01-01T00:00:00+00:00",
         completed_at="2026-01-01T00:00:01+00:00",
         pipeline_version="2.0.0",
-        model_used="deepseek-ai/DeepSeek-OCR",
+        model_used="deepseek-ai/DeepSeek-OCR-2",
         pages=[build_page(1), build_page(2)],
     )
 
diff --git a/tests/test_requirements.py b/tests/test_requirements.py
deleted file mode 100644
index 707b8fa..0000000
--- a/tests/test_requirements.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from pathlib import Path
-
-
-def test_local_requirements_include_deepseek_remote_code_dependencies():
-    requirements = (Path(__file__).parents[1] / "requirements-local.txt").read_text(
-        encoding="utf-8"
-    )
-    assert "transformers==4.46.3" in requirements
-    assert "tokenizers==0.20.3" in requirements
-    assert "addict" in requirements
-    assert "easydict" in requirements
diff --git a/tests/test_startup.py b/tests/test_startup.py
deleted file mode 100644
index e55d0f7..0000000
--- a/tests/test_startup.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import asyncio
-
-from ocr_pipeline.services.startup import ModelReadiness, configure_local_readiness
-
-
-def test_local_readiness_reports_cached_model(monkeypatch, tmp_path):
-    readiness = ModelReadiness()
-
-    monkeypatch.setattr(
-        "ocr_pipeline.services.startup.try_to_load_from_cache",
-        lambda _repo_id, filename, **_kwargs: str(tmp_path / filename),
-    )
-
-    asyncio.run(configure_local_readiness(readiness))
-
-    assert readiness.ready is True
-    assert readiness.local_model_cached is True
-    assert "cached" in readiness.status
-
-
-def test_local_readiness_reports_download_needed(monkeypatch):
-    readiness = ModelReadiness()
-
-    monkeypatch.setattr(
-        "ocr_pipeline.services.startup.try_to_load_from_cache",
-        lambda *_args, **_kwargs: None,
-    )
-
-    asyncio.run(configure_local_readiness(readiness))
-
-    assert readiness.ready is True
-    assert readiness.local_model_cached is False
-    assert "will download on first extraction" in readiness.status
diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py
index c9cd3bd..c75e77b 100644
--- a/tests/test_ui_routes.py
+++ b/tests/test_ui_routes.py
@@ -177,3 +177,17 @@ def test_home_uses_document_workbench():
     assert "currentPageQualityFlags()" in app_js
     assert "Quality flags" in html
     assert "saveSelectedDocument()" in app_js
+
+
+def test_home_exposes_profile_and_benchmark_views():
+    repo_root = Path(__file__).parents[1]
+    html = (repo_root / "ocr_pipeline/static/index.html").read_text(encoding="utf-8")
+    app_js = (repo_root / "ocr_pipeline/static/js/app.js").read_text(encoding="utf-8")
+
+    assert 'class="view-nav"' in html
+    assert "Profiles" in html
+    assert "Benchmarks" in html
+    assert "DeepSeek-OCR-2" in html
+    assert "activeView" in app_js
+    assert "extractionProfiles" in app_js
+    assert "benchmarkRows" in app_js

From b3ba4991425e775b077e156062ad8d25fee06421 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= <fatihburak@pm.me>
Date: Tue, 12 May 2026 02:48:39 +0300
Subject: [PATCH 15/15] fix: convert OCR HTML tables to text, poll active runs

---
 docker-compose.yml                    |  1 +
 ocr-model/Dockerfile                  |  2 +-
 ocr_pipeline/services/startup.py      |  2 +-
 ocr_pipeline/services/text_cleaner.py | 35 +++++++++++++++++++++++++++
 ocr_pipeline/static/js/app.js         | 33 +++++++++++++++++++++----
 tests/test_gpu_first_runtime.py       | 27 +++++++++++++++++++++
 tests/test_text_cleaner.py            | 24 ++++++++++++++++++
 7 files changed, 117 insertions(+), 7 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index fdf433f..509f8c2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -5,6 +5,7 @@ services:
   ocr-model:
     build: ./ocr-model
     runtime: nvidia
+    ipc: host
     restart: unless-stopped
     environment:
       - NVIDIA_VISIBLE_DEVICES=all
diff --git a/ocr-model/Dockerfile b/ocr-model/Dockerfile
index f75ea02..809f827 100644
--- a/ocr-model/Dockerfile
+++ b/ocr-model/Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:latest
+FROM vllm/vllm-openai:cu129-nightly
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 CMD [ \
     "--model", "deepseek-ai/DeepSeek-OCR-2", \
diff --git a/ocr_pipeline/services/startup.py b/ocr_pipeline/services/startup.py
index 1c43f81..527e2e2 100644
--- a/ocr_pipeline/services/startup.py
+++ b/ocr_pipeline/services/startup.py
@@ -53,7 +53,7 @@ async def wait_for_model_server() -> bool:
                     )
                     await asyncio.sleep(interval)
                     continue
-            except (httpx.ConnectError, httpx.ReadTimeout, httpx.ConnectTimeout) as exc:
+            except httpx.HTTPError as exc:
                 model_readiness.error = f"connection failed ({type(exc).__name__})"
                 logger.info(
                     "Model server not reachable yet (%s)", model_readiness.error
diff --git a/ocr_pipeline/services/text_cleaner.py b/ocr_pipeline/services/text_cleaner.py
index 0636323..65c1a38 100644
--- a/ocr_pipeline/services/text_cleaner.py
+++ b/ocr_pipeline/services/text_cleaner.py
@@ -1,3 +1,4 @@
+import html
 import re
 import unicodedata
 
@@ -43,6 +44,15 @@ class TextCleaner:
 
     # End-of-line soft hyphens: "word-\n" or "word- \n" followed by continuation
     _HYPHEN_RE = re.compile(r"(\w)- ?\n(\w)")
+    _TABLE_RE = re.compile(r"<table\b[^>]*>.*?</table>", re.IGNORECASE | re.DOTALL)
+    _ROW_RE = re.compile(r"<tr\b[^>]*>(.*?)</tr>", re.IGNORECASE | re.DOTALL)
+    _CELL_RE = re.compile(r"<t[dh]\b[^>]*>(.*?)</t[dh]>", re.IGNORECASE | re.DOTALL)
+    _BR_RE = re.compile(r"<br\s*/?>", re.IGNORECASE)
+    _PARA_END_RE = re.compile(r"</(?:p|div|h[1-6])\s*>", re.IGNORECASE)
+    _HTML_TAG_RE = re.compile(
+        r"</?(?:center|div|span|html|body|table|thead|tbody|tfoot|tr|td|th|p|br|h[1-6])\b[^>]*>",
+        re.IGNORECASE,
+    )
 
     def clean(self, text: str, strip_refs: bool = False) -> str:
         """Full cleaning pipeline."""
@@ -53,6 +63,7 @@ def clean(self, text: str, strip_refs: bool = False) -> str:
         text = self._strip_ref_blocks(text)
         text = self._strip_model_tokens(text)
         text = self._strip_artifacts(text)
+        text = self._html_to_text(text)
         text = self._rejoin_hyphens(text)
         text = self._normalize_whitespace(text)
         text = self._fix_common_ocr_issues(text)
@@ -96,6 +107,30 @@ def _strip_artifacts(self, text: str) -> str:
             text = pattern.sub("", text)
         return text
 
+    def _html_to_text(self, text: str) -> str:
+        """Convert occasional model-emitted HTML into readable plain text."""
+        text = self._TABLE_RE.sub(lambda match: self._table_to_text(match.group(0)), text)
+        text = self._BR_RE.sub("\n", text)
+        text = self._PARA_END_RE.sub("\n", text)
+        text = self._HTML_TAG_RE.sub("", text)
+        return html.unescape(text)
+
+    def _table_to_text(self, table: str) -> str:
+        """Render a table as " | "-joined rows; in-cell <br>/</p> breaks become spaces."""
+        rows: list[str] = []
+        for row_match in self._ROW_RE.finditer(table):
+            cells: list[str] = []
+            for cell_match in self._CELL_RE.finditer(row_match.group(1)):
+                raw = self._BR_RE.sub(" ", cell_match.group(1))
+                raw = self._PARA_END_RE.sub(" ", raw)
+                cell = html.unescape(self._HTML_TAG_RE.sub("", raw))
+                cell = re.sub(r"\s+", " ", cell).strip()
+                if cell:
+                    cells.append(cell)
+            if cells:
+                rows.append(" | ".join(cells))
+        return "\n".join(rows)
+
     def _normalize_whitespace(self, text: str) -> str:
         # Replace multiple blank lines with a single blank line
         text = re.sub(r"\n{3,}", "\n\n", text)
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js
index 908231e..db26e7d 100644
--- a/ocr_pipeline/static/js/app.js
+++ b/ocr_pipeline/static/js/app.js
@@ -120,6 +120,7 @@ function opencrApp() {
     toasts: [],
 
     _stream: new RunStream(),
+    _runPollTimer: null,
 
     async init() {
       await Promise.all([
@@ -194,12 +195,18 @@ function opencrApp() {
 
     async refreshSelectedRun() {
       if (!this.selectedRunId) return;
-      try { this.selectedRun = await API.getRun(this.selectedRunId); }
+      try {
+        this.selectedRun = await API.getRun(this.selectedRunId);
+        if (!['queued', 'processing'].includes(this.selectedRun.status)) {
+          this.stopRunPolling();
+        }
+      }
       catch (e) { this.toast(`Failed to refresh run: ${e.message}`, 'error'); }
     },
 
     async selectRun(runId) {
       this._stream.disconnect();
+      this.stopRunPolling();
       if (!runId) {
         this.activeView = 'documents';
         this.selectedRunId = null;
@@ -216,7 +223,10 @@ function opencrApp() {
         const firstCompleted = (this.selectedRun.documents || []).find(d => d.status === 'completed');
         if (firstCompleted) await this.openDocument(firstCompleted.document_id);
         else this.inspector = emptyInspector();
-        if (['queued', 'processing'].includes(this.selectedRun.status)) this.connectStream(runId);
+        if (['queued', 'processing'].includes(this.selectedRun.status)) {
+          this.connectStream(runId);
+          this.startRunPolling();
+        }
       } catch (e) {
         this.toast(`Failed to load run: ${e.message}`, 'error');
       }
@@ -224,9 +234,7 @@ function opencrApp() {
 
     connectStream(runId) {
       this._stream.connect(runId, async (event) => {
-        if (event.type === 'page_complete' || event.type === 'document_complete') {
-          await this.refreshSelectedRun();
-        }
+        await this.refreshSelectedRun();
         if (event.type === 'run_complete' || event.type === 'run_failed') {
           await Promise.all([this.refreshSelectedRun(), this.refreshRuns(), this.refreshMetrics()]);
           const completed = event.type === 'run_complete';
@@ -236,6 +244,21 @@ function opencrApp() {
       });
     },
 
+    startRunPolling() {
+      this.stopRunPolling();
+      this._runPollTimer = setInterval(async () => {
+        if (!this.selectedRunId) return this.stopRunPolling();
+        await Promise.all([this.refreshSelectedRun(), this.refreshRuns(), this.refreshMetrics()]);
+      }, 2000);
+    },
+
+    stopRunPolling() {
+      if (this._runPollTimer) {
+        clearInterval(this._runPollTimer);
+        this._runPollTimer = null;
+      }
+    },
+
     async openDocument(documentId) {
       if (!this.selectedRunId || !documentId) return;
       this.inspector.documentId = documentId;
diff --git a/tests/test_gpu_first_runtime.py b/tests/test_gpu_first_runtime.py
index d0c6192..b05f2cf 100644
--- a/tests/test_gpu_first_runtime.py
+++ b/tests/test_gpu_first_runtime.py
@@ -1,8 +1,12 @@
+import asyncio
 from pathlib import Path
 from typing import get_args
 
+import httpx
+
 from ocr_pipeline.config import Settings, settings
 from ocr_pipeline.models.schemas import HealthResponse
+from ocr_pipeline.services import startup
 from ocr_pipeline.services.startup import ModelReadiness
 
 
@@ -31,3 +35,26 @@ def test_local_backend_dependency_file_is_removed():
     repo_root = Path(__file__).parents[1]
 
     assert not (repo_root / "requirements-local.txt").exists()
+
+
+def test_model_readiness_treats_read_error_as_waiting(monkeypatch):
+    class DroppingAsyncClient:
+        def __init__(self, *args, **kwargs):
+            pass
+
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, *args):
+            return None
+
+        async def get(self, url):
+            raise httpx.ReadError("connection dropped")
+
+    monkeypatch.setattr(startup.httpx, "AsyncClient", DroppingAsyncClient)
+    monkeypatch.setattr(startup.settings, "model_ready_timeout", 0.001)
+    monkeypatch.setattr(startup.settings, "model_ready_interval", 0.001)
+    monkeypatch.setattr(startup, "model_readiness", ModelReadiness())
+
+    assert asyncio.run(startup.wait_for_model_server()) is False
+    assert startup.model_readiness.error == "connection failed (ReadError)"
diff --git a/tests/test_text_cleaner.py b/tests/test_text_cleaner.py
index aa2a3af..1cadfcd 100644
--- a/tests/test_text_cleaner.py
+++ b/tests/test_text_cleaner.py
@@ -74,6 +74,30 @@ def test_strips_null_bytes(self, cleaner):
         assert "HelloWorld" in result
 
 
+class TestHtmlCleanup:
+    def test_converts_html_table_to_plain_text_rows(self, cleaner):
+        text = (
+            "<table><tr><td>Köre almagan yiğittin,</td><td>Göremeyen yiğidin,</td></tr>"
+            "<tr><td>Kökiregi tüyilsin.</td><td>Göğsü duralsın.</td></tr></table>"
+        )
+
+        result = cleaner.clean(text)
+
+        assert result == (
+            "Köre almagan yiğittin, | Göremeyen yiğidin,\n"
+            "Kökiregi tüyilsin. | Göğsü duralsın."
+        )
+        assert "<table" not in result
+        assert "<td" not in result
+
+    def test_unescapes_html_entities(self, cleaner):
+        text = "&quot;Yüzü de ak dana, Şarifulla&#x27;nın giydiği"
+
+        result = cleaner.clean(text)
+
+        assert result == '"Yüzü de ak dana, Şarifulla\'nın giydiği'
+
+
 class TestWhitespaceNormalization:
     def test_multiple_blank_lines(self, cleaner):
         text = "Line 1\n\n\n\n\nLine 2"