diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..5690b06 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install system deps + run: sudo apt-get update && sudo apt-get install -y poppler-utils + + - name: Install Python deps + run: | + pip install -U pip + pip install -r ocr_pipeline/requirements.txt + pip install pytest pytest-asyncio ruff + + - name: Lint + run: ruff check ocr_pipeline tests scripts + + - name: Test + env: + PYTHONPATH: . + run: pytest -q diff --git a/.gitignore b/.gitignore index e3cb129..9a5fc59 100644 --- a/.gitignore +++ b/.gitignore @@ -3,14 +3,24 @@ __pycache__/ *$py.class *.so .env +.env.* +!.env.example .venv/ venv/ *.egg-info/ dist/ build/ .pytest_cache/ +.ruff_cache/ +.mypy_cache/ input/ output/ model-cache/ +hf-cache/ *.pdf !tests/fixtures/*.pdf +.DS_Store +**/.DS_Store +.idea/ +.vscode/ +.gemini/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..60d5daa --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,39 @@ +# Changelog + +All notable changes to OpenCR are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/), and the project follows [Semantic Versioning](https://semver.org/). + +## [Unreleased] + +### Added +- Apache-2.0 license (`LICENSE`). +- English-first README with Turkish sibling at `README.tr.md`. +- `CONTRIBUTING.md`, GitHub Actions CI workflow, project `Makefile`. +- **Apple Silicon / CPU support.** New `MODEL_BACKEND=local` runs DeepSeek-OCR in-process via `transformers`, no GPU server required. 
Also adds `MODEL_BACKEND=remote` for any OpenAI-compatible endpoint. +- New CPU Docker profile and `Dockerfile.cpu`: `docker compose --profile cpu up -d`. +- `scripts/start.sh` one-command launcher with smart defaults; `requirements-local.txt` for the optional `transformers`/`torch` stack. +- **HuggingFace OAuth** ("Sign in with HuggingFace"). When configured, gates the publish action on a real HF login and uses the user's own write token. Falls back to paste-token mode when OAuth is unset. +- Publish modal now prefills `username/run-name` and adds the `opencr` discoverability tag to dataset cards. + +### Changed +- **Breaking:** `docker compose up` no longer starts services without an explicit profile. Use `--profile gpu` (vLLM, NVIDIA) or `--profile cpu` (in-process transformers). +- `INPUT_DIR` / `OUTPUT_DIR` default to `./input` / `./output` outside Docker, `/data/...` inside. +- OpenAPI metadata now declares Apache-2.0; UI footer no longer claims "All rights reserved". + +### Fixed +- `.gitignore` now covers `.DS_Store`, IDE folders, lint caches, and HF caches. + +--- + +## How to release + +1. Decide the next version (`MAJOR.MINOR.PATCH`). +2. Move the `[Unreleased]` block to a new `## [X.Y.Z] — YYYY-MM-DD` heading. +3. Bump `pipeline_version` in `ocr_pipeline/config.py` to match. +4. Commit: `git commit -am "release: vX.Y.Z"`. +5. Tag: `git tag vX.Y.Z && git push --tags`. +6. GitHub auto-creates a release page from the tag; paste the changelog entry into it. + +Bump rules: +- **PATCH** for bug fixes that don't change behavior. +- **MINOR** for backwards-compatible features. +- **MAJOR** for breaking changes (env var renames, removed endpoints, behavior shifts users have to adapt to). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..f429790 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,71 @@ +# Contributing to OpenCR + +Thanks for considering a contribution. 
+ +OpenCR exists to make Turkish-language and archival OCR easy to run, +share, and improve — every PR, issue, and dataset published with OpenCR +helps that goal. + +## Ways to help + +- **File issues.** Found a bug, a layout that OpenCR struggles with, +or a confusing piece of docs? Open an issue with a short PDF +(or page screenshot) we can reproduce against. + +- **Add Turkish-language test fixtures.** A small public-domain PDF + the +expected text is one of the highest-leverage contributions. + +- **Benchmarks.** Compare OpenCR against Tesseract / Surya / +PaddleOCR / Marker on a Turkish corpus and post the table — even +informal numbers are useful. + +- **Model-backend ports.** MLX, llama.cpp, ONNX, or any other runtime +that improves throughput on a target platform. + +- **Translations.** README and dataset cards in additional languages. + +## Setup + +```bash +git clone https://github.com/cdliai/opencr.git +cd opencr +make install +make test +``` + +`make run` starts a local dev server on http://localhost:39672 with the `local` model backend (no GPU needed; ~5–30 s/page on M-series Macs). + +## Code style + +- Python: keep it boring and explicit. Type hints on public functions. No new dependencies without a brief rationale in the PR. + +- Frontend: stays Alpine + plain CSS until the state model genuinely outgrows it. No build step, no framework rewrite. + +- Tests: every new code path should have a unit or integration test. We use `pytest` and `pytest-asyncio`. + +## Pull request flow + +1. Open an issue first for non-trivial changes — a 5-line discussion saves a 500-line rewrite. + +2. Branch from `main`, name it `feat/...` or `fix/...`. + +3. Run `make lint test` before pushing. + +4. PR description: what changed, what it fixes, how to verify locally. + +## Reporting OCR-quality regressions + +If a particular PDF regresses after a change, please attach +(or link to a public copy of) the PDF, the page number, what +OpenCR produced, and what was expected. 
Quality bugs without +a reproducer are very hard to act on. + +## Code of conduct + +Be respectful. We're a small project trying to do useful work for +Turkish-language NLP — no room for harassment or +discrimination here. + +## License + +By submitting a PR, you agree your contribution is licensed under the project's [Apache 2.0 License](./LICENSE). diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..976006c --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for describing the origin of the Work and + reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Support. While redistributing the Work or + Derivative Works thereof, You may choose to offer, and charge a + fee for, acceptance of support, warranty, indemnity, or other + liability obligations and/or rights consistent with this License. + However, in accepting such obligations, You may act only on Your + own behalf and on Your sole responsibility, not on behalf of any + other Contributor, and only if You agree to indemnify, defend, + and hold each Contributor harmless for any liability incurred by, + or claims asserted against, such Contributor by reason of your + accepting any such warranty or support. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2025 cdli.ai and OpenCR contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied. See the License for the specific language governing permissions + and limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7455fb2 --- /dev/null +++ b/Makefile @@ -0,0 +1,49 @@ +.PHONY: help install install-local run run-remote test lint format docker-up docker-down clean + +PY ?= python3 +VENV ?= .venv +PIP := $(VENV)/bin/pip +PYBIN := $(VENV)/bin/python + +help: + @echo "OpenCR developer targets:" + @echo " make install # venv + base deps + local-backend deps (Mac/CPU friendly)" + @echo " make run # start dev server on http://localhost:39672 with the local backend" + @echo " make run-remote # start dev server pointing at MODEL_SERVER_URL" + @echo " make test # run pytest suite" + @echo " make lint # ruff check" + @echo " make format # ruff format" + @echo " make docker-up # docker compose up (NVIDIA GPU profile)" + @echo " make docker-down # docker compose down" + +$(VENV): + $(PY) -m venv $(VENV) + $(PIP) install -U pip + +install: $(VENV) + $(PIP) install -r ocr_pipeline/requirements.txt -r requirements-local.txt + $(PIP) install pytest pytest-asyncio ruff + +run: $(VENV) + MODEL_BACKEND=local $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload + +run-remote: $(VENV) + MODEL_BACKEND=remote $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload + +test: $(VENV) + PYTHONPATH=. 
$(PYBIN) -m pytest -q + +lint: $(VENV) + $(VENV)/bin/ruff check ocr_pipeline tests scripts + +format: $(VENV) + $(VENV)/bin/ruff format ocr_pipeline tests scripts + +docker-up: + docker compose --profile gpu up -d + +docker-down: + docker compose down + +clean: + rm -rf $(VENV) .pytest_cache .ruff_cache **/__pycache__ diff --git a/README.md b/README.md index 7c7ef0d..cc85549 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,160 @@ -# OpenCR: Türkçe ve Karmaşık Dökümanlar İçin Yüksek Performanslı OCR Hattı +# OpenCR -OpenCR, özellikle Türkçe metinler, arşiv dökümanları ve karmaşık sayfa yapısına sahip PDF'leri, yapay zeka eğitimine hazır (HuggingFace-ready) tertemiz veri setlerine dönüştüren uçtan uca bir sistemdir. +> High-performance OCR pipeline for Turkish, archival, and complex-layout documents — turning PDFs into HuggingFace-ready training datasets. -## Neden OpenCR? +OpenCR is an end-to-end open-source pipeline that converts PDFs (especially Turkish text, archival material, and pages with complex layout) into clean Parquet datasets ready for LLM training and retrieval. -- **Türkçe Odaklı Doğruluk:** DeepSeek-OCR tabanlı yapısıyla, standart OCR araçlarının zorlandığı Türkçe karakterlerde ve karmaşık sayfa düzenlerinde üstün performans sağlar. -- **Veri Seti Fabrikası:** Çıkarılan metinleri doğrudan `.parquet` formatında paketler ve tek tıkla HuggingFace'e yüklemeye hazır hale getirir. -- **Operatör Konsolu:** İşlemleri izlemek, sayfa sayfa kontrol etmek ve hataları düzeltmek için modern bir web arayüzü sunar. +For Turkish documents, see: [README.tr.md](./README.tr.md) -## Kurulum +--- + +## Why OpenCR? + +- **Turkish-first accuracy.** Built around DeepSeek-OCR, it handles Turkish characters and difficult page layouts better than off-the-shelf OCR. +- **Dataset factory.** Outputs are packaged directly as `pages.parquet` + `documents.parquet` with deterministic train/validation/test splits and a HuggingFace dataset card. 
+- **Operator console.** A single-page web UI to monitor runs, validate quality page by page, retry, and publish to HuggingFace. +- **Pluggable backends.** Production-grade NVIDIA + vLLM by default; runs in-process on Apple Silicon / CPU for development; or talk to any OpenAI-compatible model server. + +--- + +## Quickstart + +### Option 1 — Docker (NVIDIA GPU, fastest path to inference) + +Requires Docker, an NVIDIA GPU, and the NVIDIA Container Toolkit. + +```bash +docker compose --profile gpu up -d +``` + +Open http://localhost:39672. Drop PDFs in `./input/`, hit **Start OCR run**. + +### Option 2 — Apple Silicon / CPU (in-process inference, no GPU needed) + +For local development, demos, and small jobs on a Mac or Linux box with no GPU. -### Docker ile Çalıştırma (GPU Gerekir) ```bash -docker-compose up -d +git clone https://github.com/cdliai/opencr.git +cd opencr +python3 -m venv .venv && source .venv/bin/activate +pip install -r ocr_pipeline/requirements.txt -r requirements-local.txt +MODEL_BACKEND=local ./scripts/start.sh ``` -### Lokal Geliştirme ve Web Arayüzü (Apple Silicon / CPU) -Pipeline arayüzünü Apple bilgisayarınızda veya CPU üzerinde denemek için: +Open http://localhost:39672. The DeepSeek-OCR model (~6 GB) downloads +on first request and runs in-process via `transformers` on MPS (Apple Silicon) +or CPU. Expect **5–30 seconds per page on M-series, much slower on CPU** — +fine for development, not for production batch jobs. -1. **Klasör ve Ortam Hazırlığı:** - ```bash - mkdir -p input output - python3 -m venv .venv - source .venv/bin/activate - pip install -r ocr_pipeline/requirements.txt - ``` +### Option 3 — Remote model server (point at any OpenAI-compatible endpoint) -2. **Başlatma:** - ```bash - export INPUT_DIR="./input" - export OUTPUT_DIR="./output" - export PYTHONPATH=$PYTHONPATH:.
- python3 ocr_pipeline/main.py - ``` - Erişim: **http://localhost:39672** +If you already run vLLM somewhere, or use OpenRouter, or another endpoint +serving DeepSeek-OCR: -## Mimari -- **Backend:** vLLM tabanlı DeepSeek-OCR (Ağır iş yükü). -- **Frontend/API:** FastAPI & Alpine.js (Yönetim konsolu). +```bash +pip install -r ocr_pipeline/requirements.txt +MODEL_BACKEND=remote MODEL_SERVER_URL=https://your-endpoint MODEL_API_KEY=sk-... ./scripts/start.sh +``` + +--- + +## Configuration + +Configurable via environment variables (or a `.env` file): + +| Variable | Default | Description | +| -------------------- | -------------------------------- | ------------------------------------------------------------------------------------------------- | +| `MODEL_BACKEND` | `vllm` | `vllm` (NVIDIA, OpenAI-compatible server), `local` (in-process transformers), `remote` (alias). | +| `MODEL_SERVER_URL` | `http://ocr-model:39671` | Base URL for `vllm` / `remote` backends. | +| `MODEL_NAME` | `deepseek-ai/DeepSeek-OCR` | Model identifier. | +| `MODEL_API_KEY` | `EMPTY` | API key for remote endpoints. | +| `LOCAL_DEVICE` | auto | `auto`, `mps`, `cuda`, or `cpu` for the `local` backend. | +| `INPUT_DIR` | `./input` (or `/data/input`) | Where to read PDFs from. | +| `OUTPUT_DIR` | `./output` (or `/data/output`) | Where artifacts and the SQLite DB land. | +| `HOST` / `PORT` | `0.0.0.0` / `39672` | Where the web console serves. | +| `HF_OAUTH_CLIENT_ID` | unset | Enables "Sign in with HuggingFace" for the publish flow. See [HF OAuth setup](#hf-oauth-optional).| +| `APP_SESSION_SECRET` | random per process | Cookie-signing secret. Set to a stable value in production. | --- -*OpenCR, döküman arşivlerini dijitalleştirip modern yapay zeka dünyasına taşımak için [cdli.ai](https://cdli.ai) tarafından geliştirilmiştir.* + +## HuggingFace publishing + +Completed runs can be pushed to a HuggingFace dataset repo. Two modes: + +1. 
**Paste-token (default).** In the operator console, click **Publish to HuggingFace** and paste an HF write token. Or set `HF_TOKEN` in the server's environment to skip pasting. +2. **Sign in with HuggingFace (recommended for shared deployments).** Configure OAuth (below) and users sign in with their HF account. The publish flow then uses their personal token automatically. This is also how the operator console gets gated — without a session, the publish action is hidden. + +### HF OAuth (optional) + +1. Create an OAuth app at https://huggingface.co/settings/connected-applications/new with redirect URI `https://your-host/api/auth/callback` and scopes `openid profile email write-repos` (the server's default `HF_OAUTH_SCOPES`). +2. Set on the server: + +```bash +export HF_OAUTH_CLIENT_ID=... +export HF_OAUTH_CLIENT_SECRET=... +export HF_OAUTH_REDIRECT_URI=https://your-host/api/auth/callback +export APP_SESSION_SECRET=$(python3 -c 'import secrets; print(secrets.token_hex(32))') +``` + +3. Restart. The console gains a **Sign in with HuggingFace** button in the topbar. + +Published datasets are tagged `opencr` so they're discoverable via [HuggingFace's tag search](https://huggingface.co/datasets?other=opencr). + +--- + +## Architecture + +``` + ┌───────────────────────────────┐ + │ OCR pipeline (FastAPI) │ + PDFs ─────► │ ingest → render → OCR → │ ──► pages.parquet + │ clean → validate → export │ documents.parquet + │ + operator console (Alpine) │ manifest.json + └──────────────┬────────────────┘ + │ OpenAI-compatible + ▼ + ┌───────────────────────────────┐ + │ Model backend │ + │ ┌─────────────────────────┐ │ + │ │ vllm (NVIDIA, prod) │ │ + │ │ local (MPS/CPU, dev) │ │ + │ │ remote (any OpenAI URL) │ │ + │ └─────────────────────────┘ │ + └───────────────────────────────┘ +``` + +State lives in SQLite + the filesystem. +No external queue/broker is required for single-node operation. +See [docs/architectural-overhaul-v2.md](./docs/architectural-overhaul-v2.md) for the long-form design.
+ +--- + +## Development + +```bash +make install # create venv, install deps +make run # start dev server with sensible defaults +make test # run pytest suite +make lint # ruff check +``` + +Tests live under `tests/`. UI is plain HTML + Alpine.js — no build step. + +--- + +## Contributing + +Contributions are welcome — bug reports, Turkish-language +test fixtures, benchmarks against other OCR engines, model-backend +ports (MLX, llama.cpp), and documentation translations are +especially useful. + +See [CONTRIBUTING.md](./CONTRIBUTING.md). + +--- + +## License + +Apache 2.0 — see [LICENSE](./LICENSE). + +OpenCR is built and maintained by [cdli.ai](https://cdli.ai) to support Turkish-language LLM research and dataset curation. diff --git a/README.tr.md b/README.tr.md new file mode 100644 index 0000000..7c7ef0d --- /dev/null +++ b/README.tr.md @@ -0,0 +1,43 @@ +# OpenCR: Türkçe ve Karmaşık Dökümanlar İçin Yüksek Performanslı OCR Hattı + +OpenCR, özellikle Türkçe metinler, arşiv dökümanları ve karmaşık sayfa yapısına sahip PDF'leri, yapay zeka eğitimine hazır (HuggingFace-ready) tertemiz veri setlerine dönüştüren uçtan uca bir sistemdir. + +## Neden OpenCR? + +- **Türkçe Odaklı Doğruluk:** DeepSeek-OCR tabanlı yapısıyla, standart OCR araçlarının zorlandığı Türkçe karakterlerde ve karmaşık sayfa düzenlerinde üstün performans sağlar. +- **Veri Seti Fabrikası:** Çıkarılan metinleri doğrudan `.parquet` formatında paketler ve tek tıkla HuggingFace'e yüklemeye hazır hale getirir. +- **Operatör Konsolu:** İşlemleri izlemek, sayfa sayfa kontrol etmek ve hataları düzeltmek için modern bir web arayüzü sunar. + +## Kurulum + +### Docker ile Çalıştırma (GPU Gerekir) +```bash +docker compose --profile gpu up -d +``` + +### Lokal Geliştirme ve Web Arayüzü (Apple Silicon / CPU) +Pipeline arayüzünü Apple bilgisayarınızda veya CPU üzerinde denemek için: + +1.
**Klasör ve Ortam Hazırlığı:** + ```bash + mkdir -p input output + python3 -m venv .venv + source .venv/bin/activate + pip install -r ocr_pipeline/requirements.txt + ``` + +2. **Başlatma:** + ```bash + export INPUT_DIR="./input" + export OUTPUT_DIR="./output" + export PYTHONPATH=$PYTHONPATH:. + python3 ocr_pipeline/main.py + ``` + Erişim: **http://localhost:39672** + +## Mimari +- **Backend:** vLLM tabanlı DeepSeek-OCR (Ağır iş yükü). +- **Frontend/API:** FastAPI & Alpine.js (Yönetim konsolu). + +--- +*OpenCR, döküman arşivlerini dijitalleştirip modern yapay zeka dünyasına taşımak için [cdli.ai](https://cdli.ai) tarafından geliştirilmiştir.* diff --git a/docker-compose.yml b/docker-compose.yml index e4a77ed..f5c3577 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,13 @@ +# Two profiles ship out of the box: +# +# docker compose --profile gpu up -d # production: vLLM model server + pipeline (NVIDIA) +# docker compose --profile cpu up -d # CPU/Mac: pipeline only, in-process transformers backend +# +# Without an explicit --profile, no services run. Always pick one. + services: ocr-model: + profiles: ["gpu"] build: ./ocr-model runtime: nvidia restart: unless-stopped @@ -31,6 +39,7 @@ services: start_period: 600s ocr-pipeline: + profiles: ["gpu"] build: ./ocr_pipeline restart: unless-stopped ports: @@ -39,6 +48,7 @@ services: - ./input:/data/input - ./output:/data/output environment: + - MODEL_BACKEND=vllm - MODEL_SERVER_URL=http://ocr-model:39671 - INPUT_DIR=/data/input - OUTPUT_DIR=/data/output @@ -46,5 +56,24 @@ services: ocr-model: condition: service_healthy + ocr-pipeline-cpu: + profiles: ["cpu"] + build: + context: . 
+ dockerfile: ocr_pipeline/Dockerfile.cpu + restart: unless-stopped + ports: + - "39672:39672" + volumes: + - ./input:/data/input + - ./output:/data/output + - hf-cache:/root/.cache/huggingface + environment: + - MODEL_BACKEND=local + - LOCAL_DEVICE=cpu + - INPUT_DIR=/data/input + - OUTPUT_DIR=/data/output + - HF_HOME=/root/.cache/huggingface + volumes: hf-cache: diff --git a/ocr_pipeline/Dockerfile.cpu b/ocr_pipeline/Dockerfile.cpu new file mode 100644 index 0000000..7b1aedb --- /dev/null +++ b/ocr_pipeline/Dockerfile.cpu @@ -0,0 +1,34 @@ +# CPU-only image: pipeline + in-process transformers backend. +# No vLLM, no NVIDIA runtime needed. Builds on any host. +# +# Build context is the repo root (set by docker-compose.yml) so we can pull in +# requirements-local.txt alongside the pipeline source. +# +# The image is ~3 GB because it bundles torch + transformers; the model +# weights themselves (~6 GB) download on first request and cache to the +# hf-cache volume. +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y --no-install-recommends \ + poppler-utils \ + git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY ocr_pipeline/requirements.txt /tmp/requirements.txt +COPY requirements-local.txt /tmp/requirements-local.txt +# `--extra-index-url` pulls the CPU-only torch wheel. 
+RUN pip install --no-cache-dir \ + --extra-index-url https://download.pytorch.org/whl/cpu \ + -r /tmp/requirements.txt -r /tmp/requirements-local.txt + +COPY ocr_pipeline /app/ocr_pipeline/ + +ENV PYTHONPATH=/app +ENV MODEL_BACKEND=local +ENV LOCAL_DEVICE=cpu + +EXPOSE 39672 + +CMD ["uvicorn", "ocr_pipeline.main:app", "--host", "0.0.0.0", "--port", "39672"] diff --git a/ocr_pipeline/config.py b/ocr_pipeline/config.py index c0b5193..4e4a019 100644 --- a/ocr_pipeline/config.py +++ b/ocr_pipeline/config.py @@ -1,18 +1,42 @@ -from pydantic_settings import BaseSettings from pathlib import Path +from typing import Literal + +from pydantic_settings import BaseSettings + + +def _default_input_dir() -> Path: + """Use /data/input inside Docker, ./input on a developer machine.""" + if Path("/data").is_dir(): + return Path("/data/input") + return Path.cwd() / "input" + + +def _default_output_dir() -> Path: + if Path("/data").is_dir(): + return Path("/data/output") + return Path.cwd() / "output" class Settings(BaseSettings): - # Model server + # Model backend selection + # - "vllm" / "remote": call any OpenAI-compatible /v1/chat/completions server + # - "local" / "transformers": load DeepSeek-OCR in-process via transformers (Mac/CPU) + model_backend: Literal["vllm", "remote", "local", "transformers"] = "vllm" model_server_url: str = "http://ocr-model:39671" model_name: str = "deepseek-ai/DeepSeek-OCR" + model_api_key: str = "EMPTY" model_timeout: float = 120.0 - # Startup readiness - model_ready_timeout: int = 300 # Max seconds to wait for model server on startup - model_ready_interval: int = 5 # Seconds between readiness checks + # Local backend (Apple Silicon / CPU) + local_device: Literal["auto", "mps", "cuda", "cpu"] = "auto" + local_dtype: Literal["auto", "float16", "bfloat16", "float32"] = "auto" + local_model_cache: Path = Path.home() / ".cache" / "huggingface" + + # Startup readiness (used by the remote backend) + model_ready_timeout: int = 300 + model_ready_interval: 
int = 5 - # NGram processor defaults + # NGram processor defaults (vLLM-only feature; ignored by local backend) ngram_size: int = 30 window_size: int = 90 whitelist_token_ids: list[int] = [128821, 128822] # , @@ -26,11 +50,11 @@ class Settings(BaseSettings): # Retry settings max_retries: int = 2 - # Paths - input_dir: Path = Path("/data/input") - output_dir: Path = Path("/data/output") - runs_dir: Path = Path("/data/output/runs") - db_path: Path = Path("/data/output/opencr.sqlite") + # Paths — default to ./input, ./output on a dev machine; /data/* in Docker + input_dir: Path = _default_input_dir() + output_dir: Path = _default_output_dir() + runs_dir: Path = _default_output_dir() / "runs" + db_path: Path = _default_output_dir() / "opencr.sqlite" # Server host: str = "0.0.0.0" @@ -40,7 +64,21 @@ class Settings(BaseSettings): pipeline_version: str = "2.0.0" batch_concurrency: int = 8 - model_config = {"env_prefix": "", "case_sensitive": False} + # HuggingFace OAuth (optional — gates the publish UI when configured) + hf_oauth_client_id: str = "" + hf_oauth_client_secret: str = "" + hf_oauth_redirect_uri: str = "http://localhost:39672/api/auth/callback" + hf_oauth_scopes: str = "openid profile email write-repos" + + # Session signing + app_session_secret: str = "" + app_session_cookie: str = "opencr_session" + + model_config = {"env_prefix": "", "case_sensitive": False, "extra": "ignore"} + + @property + def is_local_backend(self) -> bool: + return self.model_backend in ("local", "transformers") settings = Settings() diff --git a/ocr_pipeline/main.py b/ocr_pipeline/main.py index 014f6f8..8a69d6f 100644 --- a/ocr_pipeline/main.py +++ b/ocr_pipeline/main.py @@ -1,13 +1,15 @@ import logging +import secrets from contextlib import asynccontextmanager from pathlib import Path from fastapi import FastAPI from fastapi.responses import FileResponse from fastapi.staticfiles import StaticFiles +from starlette.middleware.sessions import SessionMiddleware from ocr_pipeline.config 
import settings -from ocr_pipeline.routers import health, extract, jobs, metrics, runs, ui +from ocr_pipeline.routers import auth, health, extract, jobs, metrics, runs, ui from ocr_pipeline.services.db import init_database from ocr_pipeline.services.run_orchestrator import init_orchestrator from ocr_pipeline.services.run_storage import RunStorage @@ -53,11 +55,23 @@ async def lifespan(app: FastAPI): ), version=settings.pipeline_version, contact={"name": "cdli.ai", "url": "https://cdli.ai"}, - license_info={"name": "© cdli.ai — All rights reserved."}, + license_info={"name": "Apache-2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0"}, lifespan=lifespan, ) -for r in (health, extract, jobs, runs, metrics, ui): +# A stable APP_SESSION_SECRET keeps users signed in across server restarts; an +# empty value falls back to a per-process random secret which is fine for +# development. +_session_secret = settings.app_session_secret or secrets.token_hex(32) +app.add_middleware( + SessionMiddleware, + secret_key=_session_secret, + session_cookie=settings.app_session_cookie, + same_site="lax", + https_only=False, +) + +for r in (health, extract, jobs, runs, metrics, ui, auth): app.include_router(r.router) _static_dir = Path(__file__).parent / "static" diff --git a/ocr_pipeline/requirements.txt b/ocr_pipeline/requirements.txt index 6ef85ce..b7bddfa 100644 --- a/ocr_pipeline/requirements.txt +++ b/ocr_pipeline/requirements.txt @@ -12,3 +12,4 @@ httpx>=0.27.0 pyarrow>=18.0.0 aiosqlite>=0.20.0 huggingface-hub>=0.27.0 +itsdangerous>=2.2.0 diff --git a/ocr_pipeline/routers/auth.py b/ocr_pipeline/routers/auth.py new file mode 100644 index 0000000..ba6e2cd --- /dev/null +++ b/ocr_pipeline/routers/auth.py @@ -0,0 +1,83 @@ +"""HuggingFace OAuth routes. Always mounted, whether or not OAuth is configured. + +When OAuth is disabled the `/api/auth/me` endpoint still works and reports +`enabled: false` so the frontend can hide the sign-in button cleanly.
+""" +from __future__ import annotations + +import logging + +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import RedirectResponse + +from ocr_pipeline.services.auth_session import ( + SESS_OAUTH_STATE, + build_authorize_url, + clear_session, + exchange_code, + is_oauth_enabled, + new_state, + session_user, + store_session, +) + +logger = logging.getLogger("ocr_pipeline.auth") +router = APIRouter() + + +@router.get("/api/auth/me") +async def auth_me(request: Request): + user = session_user(request.session) if hasattr(request, "session") else None + return { + "enabled": is_oauth_enabled(), + "authenticated": user is not None, + "user": ( + { + "name": user.name, + "picture": user.picture, + "profile": user.profile, + } + if user + else None + ), + } + + +@router.get("/api/auth/login") +async def auth_login(request: Request): + if not is_oauth_enabled(): + raise HTTPException(status_code=501, detail="HuggingFace OAuth is not configured.") + state = new_state() + request.session[SESS_OAUTH_STATE] = state + return RedirectResponse(build_authorize_url(state)) + + +@router.get("/api/auth/callback") +async def auth_callback(request: Request, code: str | None = None, state: str | None = None, error: str | None = None): + if error: + raise HTTPException(status_code=400, detail=f"HuggingFace returned error: {error}") + if not code: + raise HTTPException(status_code=400, detail="Missing authorization code.") + expected_state = request.session.pop(SESS_OAUTH_STATE, None) + if not expected_state or expected_state != state: + raise HTTPException(status_code=400, detail="OAuth state mismatch — request rejected.") + + try: + token_payload = await exchange_code(code) + except Exception as exc: + logger.exception("HF OAuth exchange failed") + raise HTTPException(status_code=502, detail=f"OAuth exchange failed: {exc}") + + store_session( + request.session, + access_token=token_payload["access_token"], + expires_in=token_payload.get("expires_in"), + 
userinfo=token_payload.get("userinfo") or {}, + ) + return RedirectResponse("/") + + +@router.post("/api/auth/logout") +async def auth_logout(request: Request): + clear_session(request.session) + return {"ok": True} diff --git a/ocr_pipeline/routers/runs.py b/ocr_pipeline/routers/runs.py index f70b56a..5f3e4df 100644 --- a/ocr_pipeline/routers/runs.py +++ b/ocr_pipeline/routers/runs.py @@ -3,7 +3,7 @@ from io import BytesIO from pathlib import Path -from fastapi import APIRouter, HTTPException, Path as PathParam, Query +from fastapi import APIRouter, HTTPException, Path as PathParam, Query, Request from fastapi.responses import FileResponse, PlainTextResponse, Response, StreamingResponse from ocr_pipeline.config import settings @@ -12,6 +12,7 @@ RunCreateResponse, RunDetail, RunDocumentDetail, RunDocumentSummary, RunSummary, StagedDocumentInfo, ) +from ocr_pipeline.services.auth_session import is_oauth_enabled, session_token, session_user from ocr_pipeline.services.db import get_db from ocr_pipeline.services.hf_publisher import publish_run_to_hf from ocr_pipeline.services.pdf_renderer import PDFRenderer @@ -282,22 +283,40 @@ async def gen(): @router.post("/api/runs/{run_id}/publish/hf", response_model=HFPublishResponse) -async def publish_to_hf(request: HFPublishRequest, run_id: str = ID): +async def publish_to_hf(payload: HFPublishRequest, http_request: Request, run_id: str = ID): db = get_db() run = await _require_run(run_id) if run["status"] != "completed": raise HTTPException(status_code=409, detail="Run is not yet completed") + # Token resolution order: + # 1. signed-in user's HF OAuth token (preferred — tied to a real user) + # 2. token explicitly passed in the request body (paste-token mode) + # 3. 
HF_TOKEN env var (single-user / dev fallback, resolved inside publisher) + user = session_user(http_request.session) if hasattr(http_request, "session") else None + sess_tok = session_token(http_request.session) if user else None + + # If OAuth is enabled and the user is signed in, ignore the body token — + # we don't want a session-bound request to hand off a different account's + # credentials. If OAuth is enabled and the user isn't signed in, refuse + # publishes entirely so the panel acts as a true gate. + if is_oauth_enabled(): + if not sess_tok: + raise HTTPException(status_code=401, detail="Sign in with HuggingFace to publish.") + token = sess_tok + else: + token = payload.token + documents = await db.list_run_documents(run_id) try: result = await publish_run_to_hf( run=run, documents=documents, dataset_dir=settings.runs_dir / run_id / "dataset", - repo_id=request.repo_id, - private=request.private, - token=request.token, - commit_message=request.commit_message, + repo_id=payload.repo_id, + private=payload.private, + token=token, + commit_message=payload.commit_message, ) except FileNotFoundError as exc: raise HTTPException(status_code=404, detail=str(exc)) diff --git a/ocr_pipeline/services/auth_session.py b/ocr_pipeline/services/auth_session.py new file mode 100644 index 0000000..a0482f0 --- /dev/null +++ b/ocr_pipeline/services/auth_session.py @@ -0,0 +1,140 @@ +"""HuggingFace OAuth helpers. + +Optional. When `HF_OAUTH_CLIENT_ID` and `HF_OAUTH_CLIENT_SECRET` are both set, +OpenCR enables a "Sign in with HuggingFace" flow whose tokens drive the publish +UI. When either is unset, the publish flow falls back to the paste-token form +(and `/api/auth/me` reports `enabled: false`). + +Why OAuth and not just env-token-everywhere: +- Multi-user deployments shouldn't share one long-lived token. +- The token's repo permissions match the signed-in user, so users can only + push to repos they actually own. +- It's the basis for "gating" the panel — anonymous visitors see read-only.
+""" +from __future__ import annotations + +import logging +import secrets +import time +from dataclasses import dataclass +from typing import Any + +import httpx + +from ocr_pipeline.config import settings + +logger = logging.getLogger("ocr_pipeline.auth") + +HF_AUTHORIZE_URL = "https://huggingface.co/oauth/authorize" +HF_TOKEN_URL = "https://huggingface.co/oauth/token" +HF_USERINFO_URL = "https://huggingface.co/oauth/userinfo" + +# Session cookie keys — kept short to fit the cookie size budget. +SESS_USER = "u" # dict: {name, picture, ...} +SESS_TOKEN = "t" # str: HF access token +SESS_EXPIRES_AT = "e" # float: epoch seconds +SESS_OAUTH_STATE = "s" # str: CSRF state during the redirect dance + + +@dataclass +class HFUser: + name: str + picture: str | None = None + profile: str | None = None + email: str | None = None + + +def is_oauth_enabled() -> bool: + return bool(settings.hf_oauth_client_id and settings.hf_oauth_client_secret) + + +def build_authorize_url(state: str) -> str: + from urllib.parse import urlencode + params = { + "client_id": settings.hf_oauth_client_id, + "redirect_uri": settings.hf_oauth_redirect_uri, + "response_type": "code", + "scope": settings.hf_oauth_scopes, + "state": state, + } + return f"{HF_AUTHORIZE_URL}?{urlencode(params)}" + + +def new_state() -> str: + return secrets.token_urlsafe(24) + + +async def exchange_code(code: str) -> dict[str, Any]: + """Trade an auth code for an access token + userinfo.""" + async with httpx.AsyncClient(timeout=15) as client: + token_resp = await client.post( + HF_TOKEN_URL, + data={ + "client_id": settings.hf_oauth_client_id, + "client_secret": settings.hf_oauth_client_secret, + "code": code, + "grant_type": "authorization_code", + "redirect_uri": settings.hf_oauth_redirect_uri, + }, + headers={"Accept": "application/json"}, + ) + token_resp.raise_for_status() + token_data = token_resp.json() + + access_token = token_data.get("access_token") + if not access_token: + raise RuntimeError("HF token 
endpoint returned no access_token") + + info_resp = await client.get( + HF_USERINFO_URL, + headers={"Authorization": f"Bearer {access_token}"}, + ) + info_resp.raise_for_status() + userinfo = info_resp.json() + + return { + "access_token": access_token, + "expires_in": token_data.get("expires_in"), + "userinfo": userinfo, + } + + +def store_session(session: dict, *, access_token: str, expires_in: int | None, userinfo: dict) -> None: + """Persist auth state on the request's session dict.""" + session[SESS_TOKEN] = access_token + session[SESS_USER] = { + "name": userinfo.get("preferred_username") or userinfo.get("name") or "anonymous", + "picture": userinfo.get("picture"), + "profile": userinfo.get("profile"), + "email": userinfo.get("email"), + "orgs": [o.get("name") for o in (userinfo.get("orgs") or []) if o.get("name")], + } + if expires_in: + session[SESS_EXPIRES_AT] = time.time() + int(expires_in) + + +def clear_session(session: dict) -> None: + for key in (SESS_USER, SESS_TOKEN, SESS_EXPIRES_AT, SESS_OAUTH_STATE): + session.pop(key, None) + + +def session_user(session: dict) -> HFUser | None: + if not session.get(SESS_TOKEN): + return None + expires_at = session.get(SESS_EXPIRES_AT) + if expires_at and time.time() > float(expires_at): + clear_session(session) + return None + user = session.get(SESS_USER) or {} + return HFUser( + name=user.get("name", "anonymous"), + picture=user.get("picture"), + profile=user.get("profile"), + email=user.get("email"), + ) + + +def session_token(session: dict) -> str | None: + if session_user(session) is None: + return None + return session.get(SESS_TOKEN) diff --git a/ocr_pipeline/services/hf_publisher.py b/ocr_pipeline/services/hf_publisher.py index 700281f..5be5609 100644 --- a/ocr_pipeline/services/hf_publisher.py +++ b/ocr_pipeline/services/hf_publisher.py @@ -38,7 +38,7 @@ def _build_dataset_card( "license: cc-by-4.0\n" "task_categories:\n - text-generation\n - feature-extraction\n" f"language:\n{yaml_lang_block}\n" - 
"tags:\n - ocr\n - opencr\n - cdli-ai\n - deepseek-ocr\n - pdf\n" + "tags:\n - ocr\n - opencr\n - cdliai\n - deepseek-ocr\n - pdf\n" f"size_categories:\n - {_size_bucket(totals['pages'])}\n" "---\n" ) diff --git a/ocr_pipeline/services/local_ocr_engine.py b/ocr_pipeline/services/local_ocr_engine.py new file mode 100644 index 0000000..2305b1b --- /dev/null +++ b/ocr_pipeline/services/local_ocr_engine.py @@ -0,0 +1,198 @@ +"""In-process OCR engine using HuggingFace `transformers`. + +Used by the `local` model backend so OpenCR runs on Apple Silicon, CPU-only +boxes, and any environment without a GPU model server. Trades throughput for +zero-deployment-friction: a single Python process boots the web UI and serves +inference. + +Caveats: +- DeepSeek-OCR is ~3B params + a vision tower; on M-series Macs expect + 5–30 s/page, on CPU much slower. Production batch jobs should use vLLM. +- The model loads lazily on the first extraction request so server startup + stays fast. +- `transformers` and `torch` are intentionally optional — they only get + imported when this module is instantiated. Install them via + `requirements-local.txt`. +""" +from __future__ import annotations + +import asyncio +import logging +import tempfile +from pathlib import Path +from typing import Any + +from PIL import Image + +from ocr_pipeline.config import settings + +logger = logging.getLogger("ocr_pipeline.local_engine") + +# Maps the same `mode` strings the remote backend uses to the prompt strings +# DeepSeek-OCR's reference inference helper expects. +LOCAL_PROMPTS = { + "markdown": "\n<|grounding|>Convert the document to markdown.", + "free_ocr": "\nFree OCR.", + "figure": "\nParse the figure.", +} + + +def _resolve_device(requested: str) -> str: + if requested != "auto": + return requested + try: + import torch + except ImportError as exc: + raise RuntimeError( + "MODEL_BACKEND=local requires `torch`. 
Install with: " + "pip install -r requirements-local.txt" + ) from exc + if torch.cuda.is_available(): + return "cuda" + if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available(): + return "mps" + return "cpu" + + +def _resolve_dtype(requested: str, device: str): + import torch + if requested == "float16": + return torch.float16 + if requested == "bfloat16": + return torch.bfloat16 + if requested == "float32": + return torch.float32 + # auto — bf16 on CUDA, fp16 on MPS, fp32 on CPU (mps doesn't love bf16, cpu hates fp16) + if device == "cuda": + return torch.bfloat16 + if device == "mps": + return torch.float16 + return torch.float32 + + +class LocalOCREngine: + """In-process DeepSeek-OCR inference via `transformers`. + + Only one instance is loaded per process; concurrent requests serialize on + the same model object via an asyncio lock since GPU/MPS memory makes + parallel calls impractical at this size. + """ + + _instance: "LocalOCREngine | None" = None + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self, model_name: str | None = None) -> None: + if getattr(self, "_initialized", False): + return + self.model_name = model_name or settings.model_name + self._model: Any = None + self._tokenizer: Any = None + self._device: str | None = None + self._dtype: Any = None + self._lock = asyncio.Lock() + self._initialized = True + + async def _ensure_loaded(self) -> None: + if self._model is not None: + return + async with self._lock: + if self._model is not None: + return + await asyncio.to_thread(self._load_blocking) + + def _load_blocking(self) -> None: + try: + import torch + from transformers import AutoModel, AutoTokenizer + except ImportError as exc: + raise RuntimeError( + "MODEL_BACKEND=local requires `transformers` and `torch`. 
" + "Install with: pip install -r requirements-local.txt" + ) from exc + + device = _resolve_device(settings.local_device) + dtype = _resolve_dtype(settings.local_dtype, device) + logger.info( + "Loading %s on %s (%s). First boot downloads ~6 GB.", + self.model_name, device, dtype, + ) + + # eager attention works everywhere; flash-attn-2 is CUDA-only and would + # break MPS/CPU loads. + attn_impl = "flash_attention_2" if device == "cuda" else "eager" + + tokenizer = AutoTokenizer.from_pretrained( + self.model_name, trust_remote_code=True, + cache_dir=str(settings.local_model_cache), + ) + model = AutoModel.from_pretrained( + self.model_name, + trust_remote_code=True, + use_safetensors=True, + attn_implementation=attn_impl, + cache_dir=str(settings.local_model_cache), + ) + model = model.eval().to(dtype) + if device != "cpu": + model = model.to(device) + + self._tokenizer = tokenizer + self._model = model + self._device = device + self._dtype = dtype + logger.info("Local OCR engine ready on %s", device) + + async def extract_page( + self, + image: Image.Image, + mode: str = "markdown", + ngram_size: int | None = None, # noqa: ARG002 (vLLM-only knob) + window_size: int | None = None, # noqa: ARG002 + ) -> str: + await self._ensure_loaded() + prompt = LOCAL_PROMPTS.get(mode, LOCAL_PROMPTS["markdown"]) + + async with self._lock: + return await asyncio.to_thread(self._infer_blocking, image, prompt) + + def _infer_blocking(self, image: Image.Image, prompt: str) -> str: + # DeepSeek-OCR's `model.infer` (registered via trust_remote_code) expects a + # path on disk for the image and writes its result alongside it. We feed it + # a temp dir so nothing leaks into the output volume. 
+ with tempfile.TemporaryDirectory() as tmpdir: + tmp = Path(tmpdir) + image_path = tmp / "page.png" + image.save(image_path, format="PNG") + + try: + result = self._model.infer( + self._tokenizer, + prompt=prompt, + image_file=str(image_path), + output_path=str(tmp), + base_size=1024, + image_size=640, + crop_mode=True, + save_results=False, + test_compress=False, + ) + except TypeError: + # Older variants of the remote-code helper had a slightly + # different signature; fall back to the minimal kwargs. + result = self._model.infer( + self._tokenizer, + prompt=prompt, + image_file=str(image_path), + output_path=str(tmp), + ) + + if isinstance(result, str): + return result + # Some forks return a dict / list; prefer a 'text' key, else stringify. + if isinstance(result, dict) and "text" in result: + return str(result["text"]) + return str(result) if result is not None else "" diff --git a/ocr_pipeline/services/ocr_engine.py b/ocr_pipeline/services/ocr_engine.py index 0925146..494334d 100644 --- a/ocr_pipeline/services/ocr_engine.py +++ b/ocr_pipeline/services/ocr_engine.py @@ -1,5 +1,21 @@ +"""OCR engine abstraction. + +Two backends ship today: + +- `RemoteOCREngine` calls any OpenAI-compatible `/v1/chat/completions` endpoint + (vLLM serving DeepSeek-OCR is the production target; remote endpoints like + OpenRouter or a self-hosted shim work the same way). +- `LocalOCREngine` runs DeepSeek-OCR in-process via `transformers`. Slow but + needs no GPU server — used for Apple Silicon / CPU development. + +Pick a backend with the `MODEL_BACKEND` env var. `OCREngine()` returns the +right instance based on `settings.model_backend`. 
+""" +from __future__ import annotations + import base64 from io import BytesIO +from typing import Protocol from openai import AsyncOpenAI from PIL import Image @@ -13,24 +29,41 @@ } -class OCREngine: +class _OCREngineProtocol(Protocol): + async def extract_page( + self, + image: Image.Image, + mode: str = "markdown", + ngram_size: int | None = None, + window_size: int | None = None, + ) -> str: ... + + +def _image_to_base64(image: Image.Image) -> str: + buf = BytesIO() + image.save(buf, format="PNG") + return base64.b64encode(buf.getvalue()).decode("utf-8") + + +class RemoteOCREngine: + """Talks to any OpenAI-compatible vision endpoint.""" + def __init__( self, base_url: str | None = None, model_name: str | None = None, + api_key: str | None = None, ): self.model_name = model_name or settings.model_name self.client = AsyncOpenAI( - api_key="EMPTY", + api_key=api_key or settings.model_api_key or "EMPTY", base_url=f"{base_url or settings.model_server_url}/v1", timeout=settings.model_timeout, ) @staticmethod def image_to_base64(image: Image.Image) -> str: - buf = BytesIO() - image.save(buf, format="PNG") - return base64.b64encode(buf.getvalue()).decode("utf-8") + return _image_to_base64(image) async def extract_page( self, @@ -39,8 +72,19 @@ async def extract_page( ngram_size: int | None = None, window_size: int | None = None, ) -> str: - image_b64 = self.image_to_base64(image) prompt_text = PROMPTS.get(mode, PROMPTS["markdown"]) + image_b64 = _image_to_base64(image) + + # vLLM-specific knobs go in extra_body. Generic OpenAI servers will + # ignore unknown extra_body keys. 
+ extra_body = { + "skip_special_tokens": False, + "vllm_xargs": { + "ngram_size": ngram_size or settings.ngram_size, + "window_size": window_size or settings.window_size, + "whitelist_token_ids": settings.whitelist_token_ids, + }, + } response = await self.client.chat.completions.create( model=self.model_name, @@ -48,32 +92,29 @@ async def extract_page( { "role": "user", "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{image_b64}" - }, - }, - { - "type": "text", - "text": prompt_text, - }, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}, + {"type": "text", "text": prompt_text}, ], } ], max_tokens=settings.max_tokens, temperature=settings.temperature, - extra_body={ - "skip_special_tokens": False, - "vllm_xargs": { - "ngram_size": ngram_size or settings.ngram_size, - "window_size": window_size or settings.window_size, - "whitelist_token_ids": settings.whitelist_token_ids, - }, - }, + extra_body=extra_body, ) content = response.choices[0].message.content if content is None: raise ValueError("No content returned from model") return content - \ No newline at end of file + + +def OCREngine(*args, **kwargs) -> _OCREngineProtocol: + """Factory. Returns the engine matching `settings.model_backend`. + + Existing callers do `OCREngine()` so we keep this name as a callable. + """ + if settings.is_local_backend: + # Imported lazily so projects without the local extras (transformers, + # torch) can still use the remote backend without import errors. 
+ from ocr_pipeline.services.local_ocr_engine import LocalOCREngine + return LocalOCREngine(*args, **kwargs) + return RemoteOCREngine(*args, **kwargs) diff --git a/ocr_pipeline/services/startup.py b/ocr_pipeline/services/startup.py index 9f573c5..cc42c34 100644 --- a/ocr_pipeline/services/startup.py +++ b/ocr_pipeline/services/startup.py @@ -34,7 +34,18 @@ async def wait_for_model_server() -> bool: """ Block until the model server is healthy and can list its model. Called once at pipeline startup. Returns True if ready, False if timed out. + + For the in-process `local` backend there is nothing to wait for — the model + loads lazily on the first request — so we mark ready immediately. """ + if settings.is_local_backend: + model_readiness.ready = True + model_readiness.model_name = settings.model_name + model_readiness.error = None + model_readiness.checked_at = time.time() + logger.info("Local backend selected; model will load on first request.") + return True + base = settings.model_server_url timeout = settings.model_ready_timeout interval = settings.model_ready_interval diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css index e7e5faf..2912a34 100644 --- a/ocr_pipeline/static/css/style.css +++ b/ocr_pipeline/static/css/style.css @@ -533,3 +533,9 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent); .run-summary { grid-template-columns: repeat(2, 1fr); } .toast-container { right: 10px; left: 10px; } } + +/* ---------------- auth cluster ---------------- */ +.auth-cluster { display: flex; align-items: center; gap: 10px; margin-left: 16px; } +.auth-user { display: flex; align-items: center; gap: 8px; } +.auth-avatar { width: 24px; height: 24px; border-radius: 50%; object-fit: cover; } +.auth-name { font-size: 0.82rem; font-weight: 600; color: var(--fg); } diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html index 9a85fef..ef99aaa 100644 --- a/ocr_pipeline/static/index.html +++ 
b/ocr_pipeline/static/index.html @@ -33,6 +33,16 @@

OpenCR

+
+ +
+ + + +
+
@@ -195,9 +205,16 @@

Run

- +

+ Publishing requires sign-in. +

@@ -306,15 +323,18 @@

Publish to HuggingFace

Repo id -