From c0d57a95547380b58531bb95534db8a90472481d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= Date: Mon, 11 May 2026 20:20:50 +0300 Subject: [PATCH 01/15] feat: replace run summary cards with a terminal-style progress dashboard in the UI --- CHANGELOG.md | 6 +- ocr_pipeline/static/css/style.css | 94 ++++++++++++++++++++----------- ocr_pipeline/static/index.html | 51 +++++++---------- ocr_pipeline/static/js/app.js | 36 ++++++++++++ tests/test_ui_routes.py | 15 +++++ 5 files changed, 137 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60d5daa..1f29e9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,9 +2,10 @@ All notable changes to OpenCR are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/), and the project follows [Semantic Versioning](https://semver.org/). -## [Unreleased] +## [v1.0.0] ### Added + - Apache-2.0 license (`LICENSE`). - English-first README with Turkish sibling at `README.tr.md`. - `CONTRIBUTING.md`, GitHub Actions CI workflow, project `Makefile`. @@ -15,11 +16,13 @@ All notable changes to OpenCR are documented here. The format follows [Keep a Ch - Publish modal now prefills `username/run-name` and adds the `opencr` discoverability tag to dataset cards. ### Changed + - **Breaking:** `docker compose up` no longer starts services without an explicit profile. Use `--profile gpu` (vLLM, NVIDIA) or `--profile cpu` (in-process transformers). - `INPUT_DIR` / `OUTPUT_DIR` default to `./input` / `./output` outside Docker, `/data/...` inside. - OpenAPI metadata now declares Apache-2.0; UI footer no longer claims "All rights reserved". ### Fixed + - `.gitignore` now covers `.DS_Store`, IDE folders, lint caches, and HF caches. --- @@ -34,6 +37,7 @@ All notable changes to OpenCR are documented here. The format follows [Keep a Ch 6. GitHub auto-creates a release page from the tag; paste the changelog entry into it. Bump rules: + - **PATCH** for bug fixes that don't change behavior. - **MINOR** for backwards-compatible features. - **MAJOR** for breaking changes (env var renames, removed endpoints, behavior shifts users have to adapt to). diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css index 2912a34..310d65c 100644 --- a/ocr_pipeline/static/css/style.css +++ b/ocr_pipeline/static/css/style.css @@ -307,29 +307,74 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent); .run-detail { display: flex; flex-direction: column; } -.run-summary { - display: grid; - grid-template-columns: repeat(4, 1fr); +.run-terminal { + margin: 16px 24px; + padding: 14px 16px; + border-radius: 8px; + background: #111827; + color: #d1d5db; + font-family: var(--font-mono); +} + +.run-terminal-head, +.run-terminal-status { + display: flex; + align-items: center; +} + +.run-terminal-head { + justify-content: space-between; gap: 12px; - padding: 16px 24px; } -.summary-card { - padding: 12px 14px; - border: 1px solid var(--border); - border-radius: var(--radius-sm); - background: rgba(255, 255, 255, 0.6); + +.run-terminal-status { gap: 8px; min-width: 0; } +.run-terminal-label { color: #f9fafb; font-weight: 700; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } +.run-terminal code { color: #93c5fd; font: inherit; font-weight: 700; white-space: nowrap; } + +.run-spinner { + width: 10px; + height: 10px; + border: 2px solid rgba(209, 213, 219, 0.28); + border-top-color: #34d399; + border-radius: 50%; + animation: run-spin 0.8s linear infinite; } -.summary-label { font-size: 0.72rem; color: var(--muted); text-transform: uppercase; letter-spacing: 0.1em; } -.summary-card strong { display: block; margin-top: 4px; font-size: 1.05rem; font-weight: 700; } -.progress-track { +@keyframes run-spin { to { transform: rotate(360deg); } } + +.run-terminal-bar { height: 6px; - margin-top: 10px; + margin: 12px 0; border-radius: 999px; - background: rgba(31, 109, 85, 0.12); + background: #1f2937; overflow: hidden; } -.progress-fill { height: 100%; background: linear-gradient(90deg, #1f6d55, #2d8a6b); transition: width 0.25s ease; } + +.run-terminal-fill { + height: 100%; + border-radius: inherit; + background: #34d399; + transition: width 0.25s ease; +} + +.run-terminal-body { + display: grid; + gap: 5px; + font-size: 0.82rem; +} + +.run-terminal-body p { + display: flex; + gap: 10px; + margin: 0; + min-width: 0; +} + +.terminal-key { + width: 72px; + flex: 0 0 72px; + color: #9ca3af; +} .run-actions { display: flex; gap: 10px; padding: 0 24px 16px; } @@ -352,24 +397,6 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent); .doc-name { font-weight: 600; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } .doc-meta { display: flex; gap: 6px; align-items: center; margin-top: 4px; font-size: 0.82rem; color: var(--muted); } -.heatmap { - display: flex; - flex-wrap: wrap; - gap: 2px; - margin-top: 10px; -} -.heat-cell { - width: 12px; - height: 12px; - border-radius: 2px; - background: rgba(115, 100, 82, 0.18); -} -.heat-cell.page-pass { background: var(--success); } -.heat-cell.page-warn { background: var(--warn); } -.heat-cell.page-fail { background: var(--error); } -.heat-cell.page-empty { background: rgba(115, 100, 82, 0.5); } -.heat-cell.page-pending { background: rgba(115, 100, 82, 0.18); } - /* ---------------- inspector ---------------- */ .inspector { padding: 0; } @@ -530,7 +557,6 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent); @media (max-width: 720px) { .topbar { flex-direction: column; align-items: stretch; gap: 10px; } - .run-summary { grid-template-columns: repeat(2, 1fr); } .toast-container { right: 10px; left: 10px; } } diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html index ef99aaa..97481db 100644 --- a/ocr_pipeline/static/index.html +++ b/ocr_pipeline/static/index.html @@ -176,28 +176,30 @@

Run

-
-
- Progress - -
-
+
+
+
+ +
+
-
- Pages - - - / - +
+
-
- Stage - -
-
- Started - +
+

+ document + +

+

+ pages + +

+

+ stats + +

@@ -236,17 +238,6 @@

Documents

·
-
- -
diff --git a/ocr_pipeline/static/js/app.js b/ocr_pipeline/static/js/app.js index 2481859..2dbe4d5 100644 --- a/ocr_pipeline/static/js/app.js +++ b/ocr_pipeline/static/js/app.js @@ -187,6 +187,42 @@ function opencrApp() { return (this.inspector.document?.pages || []).find(p => p.page_num === pageNum)?.status || 'pending'; }, + currentRunDocument() { + const docs = this.selectedRun?.documents || []; + if (docs.length === 0) return null; + return docs.find(d => d.status === 'processing') + || docs.find(d => ['pending', 'queued'].includes(d.status)) + || docs.find(d => d.status === 'failed') + || docs[docs.length - 1]; + }, + + currentRunDocumentIndex() { + const docs = this.selectedRun?.documents || []; + const current = this.currentRunDocument(); + const index = current ? docs.findIndex(d => d.document_id === current.document_id) : -1; + return index === -1 ? 0 : index + 1; + }, + + runDocumentProgressLabel() { + const total = this.selectedRun?.documents_total || (this.selectedRun?.documents || []).length || 0; + return `[${this.currentRunDocumentIndex()}/${total}]`; + }, + + runProgressPercent() { + return Math.max(0, Math.min(100, Math.round((this.selectedRun?.progress || 0) * 100))); + }, + + runPageProgressLabel() { + return `${this.selectedRun?.pages_completed || 0}/${this.selectedRun?.pages_total || 0}`; + }, + + runStatsLabel() { + const docs = this.selectedRun?.documents || []; + const warn = docs.reduce((sum, d) => sum + (d.pages_warn || 0), 0); + const fail = docs.reduce((sum, d) => sum + (d.pages_fail || 0), 0); + return `${this.runProgressPercent()}% · ${warn} warn · ${fail} fail`; + }, + pageStatusClass(status) { return PAGE_STATUS[status] || 'page-pending'; }, runStatusClass(status) { return STATUS_PILL[status] || 'pill-muted'; }, diff --git a/tests/test_ui_routes.py b/tests/test_ui_routes.py index 1889184..d02f47f 100644 --- a/tests/test_ui_routes.py +++ b/tests/test_ui_routes.py @@ -1,4 +1,5 @@ from fastapi.testclient import TestClient +from pathlib import Path import ocr_pipeline.main as main_module from ocr_pipeline.config import settings @@ -59,3 +60,17 @@ async def fake_wait_for_model_server(): resp = client.get("/api/runs") assert resp.status_code == 200 assert resp.json() == [] + + +def test_run_detail_uses_minimal_terminal_progress(): + repo_root = Path(__file__).parents[1] + html = (repo_root / "ocr_pipeline/static/index.html").read_text(encoding="utf-8") + app_js = (repo_root / "ocr_pipeline/static/js/app.js").read_text(encoding="utf-8") + + assert 'class="run-terminal"' in html + assert 'x-text="runDocumentProgressLabel()"' in html + assert 'class="run-spinner"' in html + assert 'class="summary-card"' not in html + assert 'class="heatmap"' not in html + assert "currentRunDocument()" in app_js + assert "runDocumentProgressLabel()" in app_js From da34827287d46b34d381f1e39e550d8f08ea3898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fatih=20Burak=20Karag=C3=B6z?= Date: Mon, 11 May 2026 20:35:47 +0300 Subject: [PATCH 02/15] feat: introduce document catalog and workbench for metadata management and PDF orchestration --- ocr_pipeline/main.py | 4 +- ocr_pipeline/models/schemas.py | 41 ++++++ ocr_pipeline/routers/documents.py | 50 +++++++ ocr_pipeline/routers/ui.py | 3 + ocr_pipeline/services/dataset_exporter.py | 41 +++++- ocr_pipeline/services/db.py | 136 +++++++++++++++++++ ocr_pipeline/services/document_catalog.py | 36 +++++ ocr_pipeline/services/run_orchestrator.py | 10 +- ocr_pipeline/static/css/style.css | 87 ++++++++++++ ocr_pipeline/static/index.html | 155 +++++++++++++--------- ocr_pipeline/static/js/api.js | 19 +++ ocr_pipeline/static/js/app.js | 83 +++++++++++- tests/test_dataset_exporter.py | 20 ++- tests/test_document_catalog.py | 43 ++++++ tests/test_ui_routes.py | 33 +++++ 15 files changed, 684 insertions(+), 77 deletions(-) create mode 100644 ocr_pipeline/routers/documents.py create mode 100644 ocr_pipeline/services/document_catalog.py create mode 100644 tests/test_document_catalog.py diff --git a/ocr_pipeline/main.py b/ocr_pipeline/main.py index 8a69d6f..0c91e17 100644 --- a/ocr_pipeline/main.py +++ b/ocr_pipeline/main.py @@ -9,7 +9,7 @@ from starlette.middleware.sessions import SessionMiddleware from ocr_pipeline.config import settings -from ocr_pipeline.routers import auth, health, extract, jobs, metrics, runs, ui +from ocr_pipeline.routers import auth, documents, health, extract, jobs, metrics, runs, ui from ocr_pipeline.services.db import init_database from ocr_pipeline.services.run_orchestrator import init_orchestrator from ocr_pipeline.services.run_storage import RunStorage @@ -71,7 +71,7 @@ async def lifespan(app: FastAPI): https_only=False, ) -for r in (health, extract, jobs, runs, metrics, ui, auth): +for r in (health, extract, jobs, runs, documents, metrics, ui, auth): app.include_router(r.router) _static_dir = Path(__file__).parent / "static" diff --git a/ocr_pipeline/models/schemas.py b/ocr_pipeline/models/schemas.py index 635cc3f..575e963 100644 --- a/ocr_pipeline/models/schemas.py +++ b/ocr_pipeline/models/schemas.py @@ -68,6 +68,47 @@ class FileInfo(BaseModel): path: str +class DocumentUpdate(BaseModel): + display_title: Optional[str] = None + author: Optional[str] = None + work: Optional[str] = None + book: Optional[str] = None + document_date_label: Optional[str] = None + document_date_precision: Optional[str] = None + language: Optional[str] = None + script: Optional[str] = None + license: Optional[str] = None + source_citation: Optional[str] = None + notes: Optional[str] = None + tags_json: Optional[str] = None + + +class DocumentSummary(BaseModel): + id: str + filename: str + display_title: str + source_path: str + file_sha256: str + file_size_bytes: int + total_pages: Optional[int] = None + pdf_title: Optional[str] = None + pdf_author: Optional[str] = None + author: Optional[str] = None + work: Optional[str] = None + book: Optional[str] = None + document_date_label: Optional[str] = None + document_date_precision: Optional[str] = None + language: Optional[str] = None + script: Optional[str] = None + license: Optional[str] = None + source_citation: Optional[str] = None + notes: Optional[str] = None + tags_json: Optional[str] = None + metadata_complete: bool = False + latest_run_id: Optional[str] = None + latest_run_status: Optional[str] = None + + class StagedDocumentInfo(BaseModel): document_id: str filename: str diff --git a/ocr_pipeline/routers/documents.py b/ocr_pipeline/routers/documents.py new file mode 100644 index 0000000..f26c8c5 --- /dev/null +++ b/ocr_pipeline/routers/documents.py @@ -0,0 +1,50 @@ +from fastapi import APIRouter, HTTPException, Path as PathParam, Query + +from ocr_pipeline.models.schemas import DocumentSummary, DocumentUpdate, RunSummary +from ocr_pipeline.routers.runs import _run_summary +from ocr_pipeline.services.db import get_db + + +router = APIRouter() + +ID = PathParam(..., pattern=r"^[A-Za-z0-9_\-]{1,64}$") + + +def _document_summary(row: dict) -> DocumentSummary: + data = dict(row) + data["display_title"] = data.get("display_title") or data.get("pdf_title") or data["filename"] + data["metadata_complete"] = bool(data.get("metadata_complete")) + return DocumentSummary(**data) + + +@router.get("/api/documents", response_model=list[DocumentSummary]) +async def list_documents(limit: int = Query(500, ge=1, le=1000)): + return [_document_summary(d) for d in await get_db().list_documents(limit=limit)] + + +@router.get("/api/documents/{document_id}", response_model=DocumentSummary) +async def get_document(document_id: str = ID): + doc = await get_db().get_document(document_id) + if not doc: + raise HTTPException(status_code=404, detail="Document not found") + listed = [d for d in await get_db().list_documents(limit=1000) if d["id"] == document_id] + return _document_summary(listed[0] if listed else doc) + + +@router.patch("/api/documents/{document_id}", response_model=DocumentSummary) +async def update_document(payload: DocumentUpdate, document_id: str = ID): + try: + await get_db().update_document_metadata( + document_id, + **payload.model_dump(exclude_unset=True), + ) + except KeyError: + raise HTTPException(status_code=404, detail="Document not found") + return await get_document(document_id) + + +@router.get("/api/documents/{document_id}/runs", response_model=list[RunSummary]) +async def list_document_runs(document_id: str = ID): + if not await get_db().get_document(document_id): + raise HTTPException(status_code=404, detail="Document not found") + return [_run_summary(r) for r in await get_db().list_document_runs(document_id)] diff --git a/ocr_pipeline/routers/ui.py b/ocr_pipeline/routers/ui.py index 451264e..2b9c19c 100644 --- a/ocr_pipeline/routers/ui.py +++ b/ocr_pipeline/routers/ui.py @@ -6,6 +6,8 @@ from ocr_pipeline.config import settings from ocr_pipeline.models.schemas import FileInfo +from ocr_pipeline.services.db import get_db +from ocr_pipeline.services.document_catalog import catalog_pdf router = APIRouter() @@ -25,6 +27,7 @@ async def upload_pdf(file: UploadFile): dest = settings.input_dir / safe_name content = await file.read() dest.write_bytes(content) + await catalog_pdf(get_db(), dest, filename=safe_name) return {"filename": safe_name, "size": len(content), "path": str(dest)} diff --git a/ocr_pipeline/services/dataset_exporter.py b/ocr_pipeline/services/dataset_exporter.py index 67394b4..fe7645e 100644 --- a/ocr_pipeline/services/dataset_exporter.py +++ b/ocr_pipeline/services/dataset_exporter.py @@ -1,7 +1,7 @@ import hashlib import json import zipfile -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, field from pathlib import Path import pyarrow as pa @@ -29,6 +29,7 @@ class DocumentExport: metadata: DocumentMetadata document_id: str artifact_paths: ArtifactPaths + catalog_metadata: dict = field(default_factory=dict) class DatasetExporter: @@ -53,6 +54,14 @@ def _split_name(stable_key: str) -> str: return "validation" return "test" + @staticmethod + def _language_list(value) -> list[str]: + if isinstance(value, list): + return [str(v).strip() for v in value if str(v).strip()] + if not value: + return [] + return [part.strip() for part in str(value).split(",") if part.strip()] + def export_run( self, run_id: str, @@ -66,6 +75,7 @@ def export_run( for entry in documents: doc_meta = entry.metadata + catalog = entry.catalog_metadata or {} paths = entry.artifact_paths raw_text = paths.raw_txt.read_text(encoding="utf-8") if paths.raw_txt.exists() else "" clean_text = paths.clean_txt.read_text(encoding="utf-8") if paths.clean_txt.exists() else "" @@ -83,6 +93,16 @@ def export_run( "run_id": run_id, "document_id": entry.document_id, "document_name": doc_meta.filename, + "title": catalog.get("display_title") or catalog.get("title") or doc_meta.pdf_title, + "author": catalog.get("author") or doc_meta.pdf_author, + "work": catalog.get("work"), + "book": catalog.get("book"), + "document_date_label": catalog.get("document_date_label"), + "document_date_precision": catalog.get("document_date_precision"), + "language": self._language_list(catalog.get("language")) or page_meta.detected_languages, + "script": catalog.get("script") or page_meta.primary_script, + "license": catalog.get("license"), + "source_citation": catalog.get("source_citation"), "page_number": page_meta.page_num, "source_pdf_sha256": doc_meta.file_sha256, "raw_text": page_raw_text, @@ -105,12 +125,23 @@ def export_run( ) document_rows.append( - { + { "dataset_export_id": export_id, "run_id": run_id, - "document_id": entry.document_id, - "document_name": doc_meta.filename, - "source_pdf_sha256": doc_meta.file_sha256, + "document_id": entry.document_id, + "document_name": doc_meta.filename, + "title": catalog.get("display_title") or catalog.get("title") or doc_meta.pdf_title, + "author": catalog.get("author") or doc_meta.pdf_author, + "work": catalog.get("work"), + "book": catalog.get("book"), + "document_date_label": catalog.get("document_date_label"), + "document_date_precision": catalog.get("document_date_precision"), + "language": self._language_list(catalog.get("language")) or doc_meta.languages_detected, + "script": catalog.get("script") or doc_meta.dominant_script, + "license": catalog.get("license"), + "source_citation": catalog.get("source_citation"), + "notes": catalog.get("notes"), + "source_pdf_sha256": doc_meta.file_sha256, "page_count": doc_meta.total_pages, "raw_text": raw_text, "clean_text": clean_text, diff --git a/ocr_pipeline/services/db.py b/ocr_pipeline/services/db.py index 9b6d3eb..d88b671 100644 --- a/ocr_pipeline/services/db.py +++ b/ocr_pipeline/services/db.py @@ -10,6 +10,38 @@ logger = logging.getLogger("ocr_pipeline.db") +DOCUMENT_METADATA_FIELDS = { + "display_title", + "author", + "work", + "book", + "document_date_label", + "document_date_precision", + "language", + "script", + "license", + "source_citation", + "notes", + "tags_json", +} + +DOCUMENT_METADATA_COLUMNS = { + "display_title": "TEXT", + "author": "TEXT", + "work": "TEXT", + "book": "TEXT", + "document_date_label": "TEXT", + "document_date_precision": "TEXT", + "language": "TEXT", + "script": "TEXT", + "license": "TEXT", + "source_citation": "TEXT", + "notes": "TEXT", + "tags_json": "TEXT", + "catalog_updated_at": "TEXT", +} + + SCHEMA = """ CREATE TABLE IF NOT EXISTS runs ( id TEXT PRIMARY KEY, @@ -43,6 +75,19 @@ pdf_author TEXT, pdf_creation_date TEXT, pdf_producer TEXT, + display_title TEXT, + author TEXT, + work TEXT, + book TEXT, + document_date_label TEXT, + document_date_precision TEXT, + language TEXT, + script TEXT, + license TEXT, + source_citation TEXT, + notes TEXT, + tags_json TEXT, + catalog_updated_at TEXT, first_seen_at TEXT NOT NULL, last_seen_at TEXT NOT NULL ); @@ -139,6 +184,7 @@ async def connect(self) -> None: await self._conn.execute("PRAGMA journal_mode=WAL;") await self._conn.execute("PRAGMA foreign_keys=ON;") await self._conn.executescript(SCHEMA) + await self._migrate() await self._conn.commit() logger.info("Database ready at %s", self.db_path) @@ -153,6 +199,14 @@ def conn(self) -> aiosqlite.Connection: raise RuntimeError("Database not connected; call connect() first.") return self._conn + async def _migrate(self) -> None: + """Apply additive migrations for existing local SQLite catalogs.""" + async with self.conn.execute("PRAGMA table_info(documents)") as cur: + existing = {row["name"] for row in await cur.fetchall()} + for name, column_type in DOCUMENT_METADATA_COLUMNS.items(): + if name not in existing: + await self.conn.execute(f"ALTER TABLE documents ADD COLUMN {name} {column_type}") + @asynccontextmanager async def cursor(self) -> AsyncIterator[aiosqlite.Cursor]: async with self.conn.cursor() as cur: @@ -269,6 +323,8 @@ async def upsert_document( ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(id) DO UPDATE SET filename = excluded.filename, + source_path = excluded.source_path, + file_size_bytes = excluded.file_size_bytes, last_seen_at = excluded.last_seen_at, total_pages = COALESCE(excluded.total_pages, documents.total_pages), pdf_title = COALESCE(excluded.pdf_title, documents.pdf_title), @@ -300,6 +356,86 @@ async def get_document(self, document_id: str) -> Optional[dict[str, Any]]: ) as cur: return _row_to_dict(await cur.fetchone()) + async def list_documents(self, limit: int = 500) -> list[dict[str, Any]]: + async with self.conn.execute( + """ + SELECT d.id, d.filename, d.source_path, d.file_sha256, d.file_size_bytes, + d.total_pages, d.pdf_title, d.pdf_author, d.pdf_creation_date, d.pdf_producer, + d.author, d.work, d.book, d.document_date_label, d.document_date_precision, + d.language, d.script, d.license, d.source_citation, d.notes, d.tags_json, + d.catalog_updated_at, d.first_seen_at, d.last_seen_at, + COALESCE(NULLIF(d.display_title, ''), NULLIF(d.pdf_title, ''), d.filename) + AS display_title, + CASE + WHEN COALESCE(d.author, '') != '' + AND COALESCE(d.work, '') != '' + AND COALESCE(d.document_date_label, '') != '' + AND COALESCE(d.document_date_precision, '') != '' + AND COALESCE(d.language, '') != '' + AND COALESCE(d.script, '') != '' + AND COALESCE(d.license, '') != '' + THEN 1 ELSE 0 + END AS metadata_complete, + ( + SELECT r.id + FROM run_documents rd + JOIN runs r ON r.id = rd.run_id + WHERE rd.document_id = d.id + ORDER BY r.created_at DESC + LIMIT 1 + ) AS latest_run_id, + ( + SELECT r.status + FROM run_documents rd + JOIN runs r ON r.id = rd.run_id + WHERE rd.document_id = d.id + ORDER BY r.created_at DESC + LIMIT 1 + ) AS latest_run_status + FROM documents d + ORDER BY d.last_seen_at DESC + LIMIT ? + """, + (limit,), + ) as cur: + rows = await cur.fetchall() + return [_row_to_dict(r) for r in rows] # type: ignore[misc] + + async def update_document_metadata(self, document_id: str, **fields: Any) -> dict[str, Any]: + clean = {k: v for k, v in fields.items() if k in DOCUMENT_METADATA_FIELDS} + if clean: + clean["catalog_updated_at"] = _now() + cols = ", ".join(f"{k} = ?" for k in clean) + values = [*clean.values(), document_id] + cur = await self.conn.execute( + f"UPDATE documents SET {cols} WHERE id = ?", + values, + ) + await self.conn.commit() + affected = cur.rowcount or 0 + await cur.close() + if not affected: + raise KeyError(document_id) + + doc = await self.get_document(document_id) + if not doc: + raise KeyError(document_id) + return doc + + async def list_document_runs(self, document_id: str) -> list[dict[str, Any]]: + async with self.conn.execute( + """ + SELECT r.*, rd.status AS document_status, rd.pages_pass, rd.pages_warn, rd.pages_fail + FROM run_documents rd + JOIN runs r ON r.id = rd.run_id + WHERE rd.document_id = ? + ORDER BY r.created_at DESC + """, + (document_id,), + ) as cur: + rows = await cur.fetchall() + return [_row_to_dict(r) for r in rows] # type: ignore[misc] + async def get_document_by_sha(self, file_sha256: str) -> Optional[dict[str, Any]]: async with self.conn.execute( "SELECT * FROM documents WHERE file_sha256 = ?", (file_sha256,) diff --git a/ocr_pipeline/services/document_catalog.py b/ocr_pipeline/services/document_catalog.py new file mode 100644 index 0000000..6cc94a3 --- /dev/null +++ b/ocr_pipeline/services/document_catalog.py @@ -0,0 +1,36 @@ +import asyncio +import hashlib +from pathlib import Path + +import fitz + +from ocr_pipeline.services.db import Database + + +def _hash_file_sync(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def _count_pages_sync(path: Path) -> int: + with fitz.open(str(path)) as doc: + return len(doc) + + +async def catalog_pdf(db: Database, path: Path, *, filename: str | None = None) -> dict: + sha = await asyncio.to_thread(_hash_file_sync, path) + try: + page_count = await asyncio.to_thread(_count_pages_sync, path) + except Exception: + page_count = 0 + return await db.upsert_document( + sha[:16], + filename=filename or path.name, + source_path=str(path), + file_sha256=sha, + file_size_bytes=(await asyncio.to_thread(path.stat)).st_size, + total_pages=page_count or None, + ) diff --git a/ocr_pipeline/services/run_orchestrator.py b/ocr_pipeline/services/run_orchestrator.py index 5561e04..f3e02cf 100644 --- a/ocr_pipeline/services/run_orchestrator.py +++ b/ocr_pipeline/services/run_orchestrator.py @@ -227,8 +227,14 @@ async def _maybe_export(self, run_id: str, documents_meta: list, export_parquet: return None await self.db.update_run(run_id, stage="exporting") await self._emit(run_id, "dataset_export_started", {}) - exports = [DocumentExport(metadata=m, document_id=did, artifact_paths=p) - for (did, p, m) in documents_meta] + exports = [] + for did, paths, meta in documents_meta: + exports.append(DocumentExport( + metadata=meta, + document_id=did, + artifact_paths=paths, + catalog_metadata=await self.db.get_document(did) or {}, + )) result = await asyncio.to_thread( DatasetExporter(self.storage.dataset_dir(run_id)).export_run, run_id, exports, diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css index 310d65c..eb90fa6 100644 --- a/ocr_pipeline/static/css/style.css +++ b/ocr_pipeline/static/css/style.css @@ -113,6 +113,8 @@ a { color: var(--accent); } min-height: calc(100vh - 100px); } +.console-grid.document-mode { grid-template-columns: 280px minmax(0, 1fr); } + .rail, .stage, .inspector { border: 1px solid var(--border); border-radius: var(--radius); @@ -280,6 +282,86 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent); .intake-cta { display: flex; align-items: center; gap: 14px; margin-top: 6px; } +/* ---------------- document workbench ---------------- */ + +.document-workbench { display: flex; flex-direction: column; min-height: 100%; } + +.document-workbench-body { + display: grid; + grid-template-columns: minmax(0, 1fr) 340px; + gap: 14px; + padding: 16px 24px 24px; +} + +.document-library, +.document-editor { + border: 1px solid var(--border); + border-radius: var(--radius-sm); + background: rgba(255, 255, 255, 0.62); + overflow: hidden; +} + +.document-toolbar { + display: flex; + align-items: center; + gap: 12px; + min-height: 48px; + padding: 8px 12px; + border-bottom: 1px solid var(--border); +} +.document-toolbar.drag-over { background: var(--accent-soft); } + +.document-table { display: grid; } +.document-row { + display: grid; + grid-template-columns: 24px minmax(220px, 1fr) 86px 84px 78px; + align-items: center; + gap: 10px; + min-height: 52px; + padding: 8px 12px; + border-bottom: 1px solid var(--border); + cursor: pointer; +} +.document-row:last-child { border-bottom: none; } +.document-row:hover { background: rgba(255, 255, 255, 0.72); } +.document-row.active { background: var(--accent-soft); } +.document-row-head { + min-height: 34px; + cursor: default; + background: rgba(115, 100, 82, 0.08); + color: var(--muted); + font-size: 0.72rem; + font-weight: 700; + letter-spacing: 0.08em; + text-transform: uppercase; +} +.document-title { min-width: 0; display: grid; gap: 2px; } +.document-title strong, +.document-title span { overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } +.document-title span { color: var(--muted); font-size: 0.78rem; } + +.document-editor-head { + padding: 14px 16px; + border-bottom: 1px solid var(--border); +} +.document-editor-head .eyebrow { + margin: 0 0 4px; + text-transform: uppercase; + letter-spacing: 0.12em; + color: var(--muted); + font-size: 0.66rem; + font-weight: 700; +} +.document-editor-head h3 { margin: 0; font-size: 1rem; } +.document-editor-fields { display: grid; gap: 10px; padding: 14px 16px; } +.field-row { display: grid; grid-template-columns: 1fr 1fr; gap: 10px; } +.document-editor-actions { + display: flex; + justify-content: flex-end; + padding: 12px 16px; + border-top: 1px solid var(--border); +} + /* ---------------- buttons ---------------- */ .btn { @@ -543,6 +625,7 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent); @media (max-width: 1280px) { .console-grid { grid-template-columns: 240px minmax(0, 1fr) 460px; } + .console-grid.document-mode { grid-template-columns: 240px minmax(0, 1fr); } } @media (max-width: 1080px) { @@ -553,10 +636,14 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent); .rail { max-height: 320px; } .inspector { max-height: 720px; } .metric-strip { display: none; } + .document-workbench-body { grid-template-columns: 1fr; } } @media (max-width: 720px) { .topbar { flex-direction: column; align-items: stretch; gap: 10px; } + .document-row { grid-template-columns: 24px minmax(160px, 1fr) 72px 76px; } + .document-row > :last-child { display: none; } + .field-row { grid-template-columns: 1fr; } .toast-container { right: 10px; left: 10px; } } diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html index 97481db..242915b 100644 --- a/ocr_pipeline/static/index.html +++ b/ocr_pipeline/static/index.html @@ -46,7 +46,7 @@

OpenCR

-
+