diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..5690b06
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,39 @@
+name: CI
+
+# Run on pushes to main and on pull requests that target main.
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+# The job below only checks out and reads the repository, so limit the
+# automatically provided GITHUB_TOKEN to read-only access.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so the versions stay strings instead of being parsed as floats.
+        python-version: ["3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+
+      # Same system package that ocr_pipeline/Dockerfile.cpu installs.
+      - name: Install system deps
+        run: sudo apt-get update && sudo apt-get install -y poppler-utils
+
+      - name: Install Python deps
+        run: |
+          pip install -U pip
+          pip install -r ocr_pipeline/requirements.txt
+          pip install pytest pytest-asyncio ruff
+
+      - name: Lint
+        run: ruff check ocr_pipeline tests scripts
+
+      # PYTHONPATH points at the repo root, mirroring `make test`.
+      - name: Test
+        env:
+          PYTHONPATH: .
+        run: pytest -q
diff --git a/.gitignore b/.gitignore
index e3cb129..9a5fc59 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,14 +3,24 @@ __pycache__/
*$py.class
*.so
.env
+.env.*
+!.env.example
.venv/
venv/
*.egg-info/
dist/
build/
.pytest_cache/
+.ruff_cache/
+.mypy_cache/
input/
output/
model-cache/
+hf-cache/
*.pdf
!tests/fixtures/*.pdf
+.DS_Store
+**/.DS_Store
+.idea/
+.vscode/
+.gemini/
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..60d5daa
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,39 @@
+# Changelog
+
+All notable changes to OpenCR are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/), and the project follows [Semantic Versioning](https://semver.org/).
+
+## [Unreleased]
+
+### Added
+- Apache-2.0 license (`LICENSE`).
+- English-first README with Turkish sibling at `README.tr.md`.
+- `CONTRIBUTING.md`, GitHub Actions CI workflow, project `Makefile`.
+- **Apple Silicon / CPU support.** New `MODEL_BACKEND=local` runs DeepSeek-OCR in-process via `transformers`, no GPU server required. Also adds `MODEL_BACKEND=remote` for any OpenAI-compatible endpoint.
+- New CPU Docker profile and `Dockerfile.cpu`: `docker compose --profile cpu up -d`.
+- `scripts/start.sh` one-command launcher with smart defaults; `requirements-local.txt` for the optional `transformers`/`torch` stack.
+- **HuggingFace OAuth** ("Sign in with HuggingFace"). When configured, gates the publish action on a real HF login and uses the user's own write token. Falls back to paste-token mode when OAuth is unset.
+- Publish modal now prefills `username/run-name` and adds the `opencr` discoverability tag to dataset cards.
+
+### Changed
+- **Breaking:** `docker compose up` no longer starts services without an explicit profile. Use `--profile gpu` (vLLM, NVIDIA) or `--profile cpu` (in-process transformers).
+- `INPUT_DIR` / `OUTPUT_DIR` default to `./input` / `./output` outside Docker, `/data/...` inside.
+- OpenAPI metadata now declares Apache-2.0; UI footer no longer claims "All rights reserved".
+
+### Fixed
+- `.gitignore` now covers `.DS_Store`, IDE folders, lint caches, and HF caches.
+
+---
+
+## How to release
+
+1. Decide the next version (`MAJOR.MINOR.PATCH`).
+2. Move the entries under `[Unreleased]` to a new `## [X.Y.Z] - YYYY-MM-DD` heading, leaving an empty `[Unreleased]` section at the top.
+3. Bump `pipeline_version` in `ocr_pipeline/config.py` to match.
+4. Commit: `git commit -am "release: vX.Y.Z"`.
+5. Tag and push both the release commit and the tag: `git tag vX.Y.Z && git push origin main vX.Y.Z`.
+6. Create a GitHub release from the tag and paste the changelog entry into it.
+
+Bump rules:
+- **PATCH** for backwards-compatible bug fixes.
+- **MINOR** for backwards-compatible features.
+- **MAJOR** for breaking changes (env var renames, removed endpoints, behavior shifts users have to adapt to).
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..f429790
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,71 @@
+# Contributing to OpenCR
+
+Thanks for considering a contribution.
+
+OpenCR exists to make Turkish-language and archival OCR easy to run,
+share, and improve — every PR, issue, and dataset published with OpenCR
+helps that goal.
+
+## Ways to help
+
+- **File issues.** Found a bug, a layout that OpenCR struggles with,
+or a confusing piece of docs? Open an issue with a short PDF
+(or page screenshot) we can reproduce against.
+
+- **Add Turkish-language test fixtures.** A small public-domain PDF + the
+expected text is one of the highest-leverage contributions.
+
+- **Benchmarks.** Compare OpenCR against Tesseract / Surya /
+PaddleOCR / Marker on a Turkish corpus and post the table — even
+informal numbers are useful.
+
+- **Model-backend ports.** MLX, llama.cpp, ONNX, or any other runtime
+that improves throughput on a target platform.
+
+- **Translations.** README and dataset cards in additional languages.
+
+## Setup
+
+```bash
+git clone https://github.com/cdliai/opencr.git
+cd opencr
+make install
+make test
+```
+
+`make run` starts a local dev server on http://localhost:39672 with the `local` model backend (no GPU needed; ~5–30 s/page on M-series Macs).
+
+## Code style
+
+- Python: keep it boring and explicit. Type hints on public functions. No new dependencies without a brief rationale in the PR.
+
+- Frontend: stays Alpine + plain CSS until the state model genuinely outgrows it. No build step, no framework rewrite.
+
+- Tests: every new code path should have a unit or integration test. We use `pytest` and `pytest-asyncio`.
+
+## Pull request flow
+
+1. Open an issue first for non-trivial changes — a 5-line discussion saves a 500-line rewrite.
+
+2. Branch from `main`, name it `feat/...` or `fix/...`.
+
+3. Run `make lint test` before pushing.
+
+4. PR description: what changed, what it fixes, how to verify locally.
+
+## Reporting OCR-quality regressions
+
+If a particular PDF regresses after a change, please attach
+(or link to a public copy of) the PDF, the page number, what
+OpenCR produced, and what was expected. Quality bugs without
+a reproducer are very hard to act on.
+
+## Code of conduct
+
+Be respectful. We're a small project trying to do useful work for
+Turkish-language NLP — no room for harassment or
+discrimination here.
+
+## License
+
+By submitting a PR, you agree your contribution is licensed under the project's [Apache 2.0 License](./LICENSE).
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..976006c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for describing the origin of the Work and
+ reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Support. While redistributing the Work or
+ Derivative Works thereof, You may choose to offer, and charge a
+ fee for, acceptance of support, warranty, indemnity, or other
+ liability obligations and/or rights consistent with this License.
+ However, in accepting such obligations, You may act only on Your
+ own behalf and on Your sole responsibility, not on behalf of any
+ other Contributor, and only if You agree to indemnify, defend,
+ and hold each Contributor harmless for any liability incurred by,
+ or claims asserted against, such Contributor by reason of your
+ accepting any such warranty or support.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2025 cdli.ai and OpenCR contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied. See the License for the specific language governing permissions
+ and limitations under the License.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7455fb2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,49 @@
+.PHONY: help install run run-remote test lint format docker-up docker-down clean
+
+# Interpreter used to create the virtualenv, and the virtualenv location.
+# Both may be overridden on the command line, e.g. `make install PY=python3.12`.
+PY ?= python3
+VENV ?= .venv
+PIP := $(VENV)/bin/pip
+PYBIN := $(VENV)/bin/python
+
+# Default target: print the list of developer targets.
+help:
+	@echo "OpenCR developer targets:"
+	@echo " make install # venv + base deps + local-backend deps (Mac/CPU friendly)"
+	@echo " make run # start dev server on http://localhost:39672 with the local backend"
+	@echo " make run-remote # start dev server pointing at MODEL_SERVER_URL"
+	@echo " make test # run pytest suite"
+	@echo " make lint # ruff check"
+	@echo " make format # ruff format"
+	@echo " make docker-up # docker compose up (NVIDIA GPU profile)"
+	@echo " make docker-down # docker compose down"
+	@echo " make clean # remove the venv and tool caches"
+
+# Create the virtualenv on first use; targets below depend on this directory.
+$(VENV):
+	$(PY) -m venv $(VENV)
+	$(PIP) install -U pip
+
+# Install runtime, local-backend, and developer (pytest, ruff) dependencies.
+install: $(VENV)
+	$(PIP) install -r ocr_pipeline/requirements.txt -r requirements-local.txt
+	$(PIP) install pytest pytest-asyncio ruff
+
+# Dev server with auto-reload, using the in-process `local` backend.
+run: $(VENV)
+	MODEL_BACKEND=local $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload
+
+# Dev server with auto-reload, using the `remote` backend.
+run-remote: $(VENV)
+	MODEL_BACKEND=remote $(PYBIN) -m uvicorn ocr_pipeline.main:app --host 0.0.0.0 --port 39672 --reload
+
+# pytest and ruff are installed by `make install`; run that first.
+test: $(VENV)
+	PYTHONPATH=. $(PYBIN) -m pytest -q
+
+lint: $(VENV)
+	$(VENV)/bin/ruff check ocr_pipeline tests scripts
+
+format: $(VENV)
+	$(VENV)/bin/ruff format ocr_pipeline tests scripts
+
+# Starts the services in the `gpu` compose profile.
+docker-up:
+	docker compose --profile gpu up -d
+
+docker-down:
+	docker compose down
+
+# `**` is not recursive in make's default /bin/sh, so use find to remove
+# __pycache__ directories at every depth. The venv is deleted first so find
+# does not have to walk it.
+clean:
+	rm -rf $(VENV) .pytest_cache .ruff_cache
+	find . -type d -name __pycache__ -prune -exec rm -rf {} +
diff --git a/README.md b/README.md
index 7c7ef0d..cc85549 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,160 @@
-#
OpenCR: Türkçe ve Karmaşık Dökümanlar İçin Yüksek Performanslı OCR Hattı
+#
OpenCR
-OpenCR, özellikle Türkçe metinler, arşiv dökümanları ve karmaşık sayfa yapısına sahip PDF'leri, yapay zeka eğitimine hazır (HuggingFace-ready) tertemiz veri setlerine dönüştüren uçtan uca bir sistemdir.
+> High-performance OCR pipeline for Turkish, archival, and complex-layout documents — turning PDFs into HuggingFace-ready training datasets.
-## Neden OpenCR?
+OpenCR is an end-to-end open-source pipeline that converts PDFs (especially Turkish text, archival material, and pages with complex layout) into clean Parquet datasets ready for LLM training and retrieval.
-- **Türkçe Odaklı Doğruluk:** DeepSeek-OCR tabanlı yapısıyla, standart OCR araçlarının zorlandığı Türkçe karakterlerde ve karmaşık sayfa düzenlerinde üstün performans sağlar.
-- **Veri Seti Fabrikası:** Çıkarılan metinleri doğrudan `.parquet` formatında paketler ve tek tıkla HuggingFace'e yüklemeye hazır hale getirir.
-- **Operatör Konsolu:** İşlemleri izlemek, sayfa sayfa kontrol etmek ve hataları düzeltmek için modern bir web arayüzü sunar.
+For the Turkish version of this README, see [README.tr.md](./README.tr.md).
-## Kurulum
+---
+
+## Why OpenCR?
+
+- **Turkish-first accuracy.** Built around DeepSeek-OCR, it handles Turkish characters and difficult page layouts better than off-the-shelf OCR.
+- **Dataset factory.** Outputs are packaged directly as `pages.parquet` + `documents.parquet` with deterministic train/validation/test splits and a HuggingFace dataset card.
+- **Operator console.** A single-page web UI to monitor runs, page-by-page validate quality, retry, and publish to HuggingFace.
+- **Pluggable backends.** Production-grade NVIDIA + vLLM by default; runs in-process on Apple Silicon / CPU for development; or talk to any OpenAI-compatible model server.
+
+---
+
+## Quickstart
+
+### Option 1 — Docker (NVIDIA GPU, fastest path to inference)
+
+Requires Docker, an NVIDIA GPU, and the NVIDIA Container Toolkit.
+
+```bash
+docker compose --profile gpu up -d
+```
+
+Open http://localhost:39672. Drop PDFs in `./input/`, hit **Start OCR run**.
+
+### Option 2 — Apple Silicon / CPU (in-process inference, no GPU needed)
+
+For local development, demos, and small jobs on a Mac or Linux box with no GPU.
-### Docker ile Çalıştırma (GPU Gerekir)
```bash
-docker-compose up -d
+git clone https://github.com/cdliai/opencr.git
+cd opencr
+python3 -m venv .venv && source .venv/bin/activate
+pip install -r ocr_pipeline/requirements.txt -r requirements-local.txt
+MODEL_BACKEND=local ./scripts/start.sh
```
-### Lokal Geliştirme ve Web Arayüzü (Apple Silicon / CPU)
-Pipeline arayüzünü Apple bilgisayarınızda veya CPU üzerinde denemek için:
+Open http://localhost:39672. The DeepSeek-OCR model (~6 GB) downloads
+on first request and runs in-process via `transformers` on MPS (Apple Silicon)
+or CPU. Expect **5–30 seconds per page on M-series, much slower on CPU** —
+fine for development, not for production batch jobs.
-1. **Klasör ve Ortam Hazırlığı:**
- ```bash
- mkdir -p input output
- python3 -m venv .venv
- source .venv/bin/activate
- pip install -r ocr_pipeline/requirements.txt
- ```
+### Option 3 — Remote model server (point at any OpenAI-compatible endpoint)
-2. **Başlatma:**
- ```bash
- export INPUT_DIR="./input"
- export OUTPUT_DIR="./output"
- export PYTHONPATH=$PYTHONPATH:.
- python3 ocr_pipeline/main.py
- ```
- Erişim: **http://localhost:39672**
+If you already run vLLM somewhere, or use OpenRouter, or another endpoint
+serving DeepSeek-OCR:
-## Mimari
-- **Backend:** vLLM tabanlı DeepSeek-OCR (Ağır iş yükü).
-- **Frontend/API:** FastAPI & Alpine.js (Yönetim konsolu).
+```bash
+pip install -r ocr_pipeline/requirements.txt
+MODEL_BACKEND=remote MODEL_SERVER_URL=https://your-endpoint MODEL_API_KEY=sk-... ./scripts/start.sh
+```
+
+---
+
+## Configuration
+
+Configurable via environment variables (or a `.env` file):
+
+| Variable | Default | Description |
+| -------------------- | -------------------------------- | ------------------------------------------------------------------------------------------------- |
+| `MODEL_BACKEND` | `vllm` | `vllm` (NVIDIA, OpenAI-compatible server), `remote` (any other OpenAI-compatible endpoint), `local` (in-process transformers; `transformers` is accepted as an alias). |
+| `MODEL_SERVER_URL` | `http://ocr-model:39671` | Base URL for `vllm` / `remote` backends. |
+| `MODEL_NAME` | `deepseek-ai/DeepSeek-OCR` | Model identifier. |
+| `MODEL_API_KEY` | `EMPTY` | API key for remote endpoints. |
+| `LOCAL_DEVICE` | auto | `auto`, `mps`, `cuda`, or `cpu` for the `local` backend. |
+| `INPUT_DIR` | `./input` (or `/data/input`) | Where to read PDFs from. |
+| `OUTPUT_DIR` | `./output` (or `/data/output`) | Where artifacts and the SQLite DB land. |
+| `HOST` / `PORT` | `0.0.0.0` / `39672` | Where the web console serves. |
+| `HF_OAUTH_CLIENT_ID` | unset | Enables "Sign in with HuggingFace" for the publish flow. See [HF OAuth setup](#hf-oauth-optional).|
+| `APP_SESSION_SECRET` | random per process | Cookie-signing secret. Set to a stable value in production. |
---
-*OpenCR, döküman arşivlerini dijitalleştirip modern yapay zeka dünyasına taşımak için [cdli.ai](https://cdli.ai) tarafından geliştirilmiştir.*
+
+## HuggingFace publishing
+
+Completed runs can be pushed to a HuggingFace dataset repo. Two modes:
+
+1. **Paste-token (default).** In the operator console, click **Publish to HuggingFace** and paste a HF write token. Or set `HF_TOKEN` in the server's environment to skip pasting.
+2. **Sign in with HuggingFace (recommended for shared deployments).** Configure OAuth (below) and users sign in with their HF account. The publish flow then uses their personal token automatically. This also gates publishing: without a signed-in session, the publish action is hidden.
+
+### HF OAuth (optional)
+
+1. Create an OAuth app at https://huggingface.co/settings/connected-applications/new with redirect URI `https://your-host/api/auth/callback` and scopes `openid profile write-repos`.
+2. Set on the server:
+
+```bash
+export HF_OAUTH_CLIENT_ID=...
+export HF_OAUTH_CLIENT_SECRET=...
+export HF_OAUTH_REDIRECT_URI=https://your-host/api/auth/callback
+export APP_SESSION_SECRET=$(python3 -c 'import secrets; print(secrets.token_hex(32))')
+```
+
+3. Restart. The console gains a **Sign in with HuggingFace** button in the topbar.
+
+Published datasets are tagged `opencr` so they're discoverable via [HuggingFace's tag search](https://huggingface.co/datasets?other=opencr).
+
+---
+
+## Architecture
+
+```
+ ┌───────────────────────────────┐
+ │ OCR pipeline (FastAPI) │
+ PDFs ─────►. │ ingest → render → OCR → │ ──► pages.parquet
+ │ clean → validate → export │ documents.parquet
+ │ + operator console (Alpine) │ manifest.json
+ └──────────────┬────────────────┘
+ │ OpenAI-compatible
+ ▼
+ ┌───────────────────────────────┐
+ │ Model backend │
+ │ ┌─────────────────────────┐ │
+ │ │ vllm (NVIDIA, prod) │ │
+ │ │ local (MPS/CPU, dev) │ │
+ │ │ remote (any OpenAI URL) │ │
+ │ └─────────────────────────┘ │
+ └───────────────────────────────┘
+```
+
+State lives in SQLite + the filesystem.
+No external queue/broker is required for single-node operation.
+See [docs/architectural-overhaul-v2.md](./docs/architectural-overhaul-v2.md) for the long-form design.
+
+---
+
+## Development
+
+```bash
+make install # create venv, install deps
+make run # start dev server with sensible defaults
+make test # run pytest suite
+make lint # ruff check
+```
+
+Tests live under `tests/`. UI is plain HTML + Alpine.js — no build step.
+
+---
+
+## Contributing
+
+Contributions are welcome — bug reports, Turkish-language
+test fixtures, benchmarks against other OCR engines, model-backend
+ports (MLX, llama.cpp), and documentation translations are
+especially useful.
+
+See [CONTRIBUTING.md](./CONTRIBUTING.md).
+
+---
+
+## License
+
+Apache 2.0 — see [LICENSE](./LICENSE).
+
+OpenCR is built and maintained by [cdli.ai](https://cdli.ai) to support Turkish-language LLM research and dataset curation.
diff --git a/README.tr.md b/README.tr.md
new file mode 100644
index 0000000..7c7ef0d
--- /dev/null
+++ b/README.tr.md
@@ -0,0 +1,43 @@
+#
OpenCR: Türkçe ve Karmaşık Dökümanlar İçin Yüksek Performanslı OCR Hattı
+
+OpenCR, özellikle Türkçe metinler, arşiv dökümanları ve karmaşık sayfa yapısına sahip PDF'leri, yapay zeka eğitimine hazır (HuggingFace-ready) tertemiz veri setlerine dönüştüren uçtan uca bir sistemdir.
+
+## Neden OpenCR?
+
+- **Türkçe Odaklı Doğruluk:** DeepSeek-OCR tabanlı yapısıyla, standart OCR araçlarının zorlandığı Türkçe karakterlerde ve karmaşık sayfa düzenlerinde üstün performans sağlar.
+- **Veri Seti Fabrikası:** Çıkarılan metinleri doğrudan `.parquet` formatında paketler ve tek tıkla HuggingFace'e yüklemeye hazır hale getirir.
+- **Operatör Konsolu:** İşlemleri izlemek, sayfa sayfa kontrol etmek ve hataları düzeltmek için modern bir web arayüzü sunar.
+
+## Kurulum
+
+### Docker ile Çalıştırma (GPU Gerekir)
+```bash
+docker compose --profile gpu up -d
+```
+
+### Lokal Geliştirme ve Web Arayüzü (Apple Silicon / CPU)
+Pipeline arayüzünü Apple bilgisayarınızda veya CPU üzerinde denemek için:
+
+1. **Klasör ve Ortam Hazırlığı:**
+ ```bash
+ mkdir -p input output
+ python3 -m venv .venv
+ source .venv/bin/activate
+ pip install -r ocr_pipeline/requirements.txt
+ ```
+
+2. **Başlatma:**
+ ```bash
+ export INPUT_DIR="./input"
+ export OUTPUT_DIR="./output"
+ export PYTHONPATH=$PYTHONPATH:.
+ python3 ocr_pipeline/main.py
+ ```
+ Erişim: **http://localhost:39672**
+
+## Mimari
+- **Backend:** vLLM tabanlı DeepSeek-OCR (Ağır iş yükü).
+- **Frontend/API:** FastAPI & Alpine.js (Yönetim konsolu).
+
+---
+*OpenCR, döküman arşivlerini dijitalleştirip modern yapay zeka dünyasına taşımak için [cdli.ai](https://cdli.ai) tarafından geliştirilmiştir.*
diff --git a/docker-compose.yml b/docker-compose.yml
index e4a77ed..f5c3577 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,13 @@
+# Two profiles ship out of the box:
+#
+# docker compose --profile gpu up -d # production: vLLM model server + pipeline (NVIDIA)
+# docker compose --profile cpu up -d # CPU/Mac: pipeline only, in-process transformers backend
+#
+# Without an explicit --profile, no services run. Always pick one.
+
services:
ocr-model:
+ profiles: ["gpu"]
build: ./ocr-model
runtime: nvidia
restart: unless-stopped
@@ -31,6 +39,7 @@ services:
start_period: 600s
ocr-pipeline:
+ profiles: ["gpu"]
build: ./ocr_pipeline
restart: unless-stopped
ports:
@@ -39,6 +48,7 @@ services:
- ./input:/data/input
- ./output:/data/output
environment:
+ - MODEL_BACKEND=vllm
- MODEL_SERVER_URL=http://ocr-model:39671
- INPUT_DIR=/data/input
- OUTPUT_DIR=/data/output
@@ -46,5 +56,24 @@ services:
ocr-model:
condition: service_healthy
+  # CPU profile: runs the pipeline with the in-process `local` backend on CPU,
+  # built from ocr_pipeline/Dockerfile.cpu with the repo root as build context.
+  ocr-pipeline-cpu:
+    profiles:
+      - cpu
+    restart: unless-stopped
+    build:
+      context: .
+      dockerfile: ocr_pipeline/Dockerfile.cpu
+    ports:
+      - "39672:39672"
+    environment:
+      - MODEL_BACKEND=local
+      - LOCAL_DEVICE=cpu
+      - HF_HOME=/root/.cache/huggingface
+      - INPUT_DIR=/data/input
+      - OUTPUT_DIR=/data/output
+    volumes:
+      - ./input:/data/input
+      - ./output:/data/output
+      # Named volume mounted at HF_HOME so the HuggingFace cache persists.
+      - hf-cache:/root/.cache/huggingface
+
volumes:
hf-cache:
diff --git a/ocr_pipeline/Dockerfile.cpu b/ocr_pipeline/Dockerfile.cpu
new file mode 100644
index 0000000..7b1aedb
--- /dev/null
+++ b/ocr_pipeline/Dockerfile.cpu
@@ -0,0 +1,34 @@
+# CPU-only image: the pipeline plus the in-process transformers backend.
+# It needs neither vLLM nor the NVIDIA runtime, so it builds on any host.
+#
+# docker-compose.yml sets the build context to the repo root, which is why
+# requirements-local.txt can be copied in next to the pipeline source.
+#
+# Expect an image of roughly 3 GB (torch + transformers are bundled). The
+# model weights (~6 GB) are not baked in: they download on first request and
+# are cached in the hf-cache volume.
+FROM python:3.12-slim
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        git \
+        poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy only the requirement files before installing dependencies, so the
+# install layer is not invalidated by source-code changes.
+COPY ocr_pipeline/requirements.txt requirements-local.txt /tmp/
+# The extra index is PyTorch's CPU wheel index, used for the CPU-only torch build.
+RUN pip install --no-cache-dir \
+        --extra-index-url https://download.pytorch.org/whl/cpu \
+        --requirement /tmp/requirements.txt \
+        --requirement /tmp/requirements-local.txt
+
+COPY ocr_pipeline /app/ocr_pipeline/
+
+ENV PYTHONPATH=/app \
+    MODEL_BACKEND=local \
+    LOCAL_DEVICE=cpu
+
+EXPOSE 39672
+
+CMD ["uvicorn", "ocr_pipeline.main:app", "--host", "0.0.0.0", "--port", "39672"]
diff --git a/ocr_pipeline/config.py b/ocr_pipeline/config.py
index c0b5193..4e4a019 100644
--- a/ocr_pipeline/config.py
+++ b/ocr_pipeline/config.py
@@ -1,18 +1,42 @@
-from pydantic_settings import BaseSettings
from pathlib import Path
+from typing import Literal
+
+from pydantic_settings import BaseSettings
+
+
+def _default_input_dir() -> Path:
+    """Return the default input directory.
+
+    Resolves to ``/data/input`` when a ``/data`` directory exists (the layout
+    the Docker Compose services mount), and to ``input`` under the current
+    working directory otherwise.
+    """
+    has_data_root = Path("/data").is_dir()
+    return Path("/data/input") if has_data_root else Path.cwd() / "input"
+
+
+def _default_output_dir() -> Path:
+    """Return the default output directory.
+
+    Resolves to ``/data/output`` when a ``/data`` directory exists (the layout
+    the Docker Compose services mount), and to ``output`` under the current
+    working directory otherwise.
+    """
+    has_data_root = Path("/data").is_dir()
+    return Path("/data/output") if has_data_root else Path.cwd() / "output"
class Settings(BaseSettings):
- # Model server
+ # Model backend selection
+ # - "vllm" / "remote": call any OpenAI-compatible /v1/chat/completions server
+ # - "local" / "transformers": load DeepSeek-OCR in-process via transformers (Mac/CPU)
+ model_backend: Literal["vllm", "remote", "local", "transformers"] = "vllm"
model_server_url: str = "http://ocr-model:39671"
model_name: str = "deepseek-ai/DeepSeek-OCR"
+ model_api_key: str = "EMPTY"
model_timeout: float = 120.0
- # Startup readiness
- model_ready_timeout: int = 300 # Max seconds to wait for model server on startup
- model_ready_interval: int = 5 # Seconds between readiness checks
+ # Local backend (Apple Silicon / CPU)
+ local_device: Literal["auto", "mps", "cuda", "cpu"] = "auto"
+ local_dtype: Literal["auto", "float16", "bfloat16", "float32"] = "auto"
+ local_model_cache: Path = Path.home() / ".cache" / "huggingface"
+
+ # Startup readiness (used by the remote backend)
+ model_ready_timeout: int = 300
+ model_ready_interval: int = 5
- # NGram processor defaults
+ # NGram processor defaults (vLLM-only feature; ignored by local backend)
ngram_size: int = 30
window_size: int = 90
whitelist_token_ids: list[int] = [128821, 128822] #