From 309385af872906075ce5a6cf56757ccf72639832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sat, 2 May 2026 23:10:55 +0300 Subject: [PATCH 01/14] add DS_Store --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e3cb129..c6b98e4 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ output/ model-cache/ *.pdf !tests/fixtures/*.pdf +.DS_Store From 4776966deea2cd3ab0597911ead237cff176c32c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sat, 2 May 2026 23:26:45 +0300 Subject: [PATCH 02/14] ci: add test control for python 3.11 & 3.12 --- .github/workflows/ci.yml | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..5690b06 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + + - name: Install system deps + run: sudo apt-get update && sudo apt-get install -y poppler-utils + + - name: Install Python deps + run: | + pip install -U pip + pip install -r ocr_pipeline/requirements.txt + pip install pytest pytest-asyncio ruff + + - name: Lint + run: ruff check ocr_pipeline tests scripts + + - name: Test + env: + PYTHONPATH: . 
+ run: pytest -q From 2c571528f634e86b37f028626d2bd67f763d735a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sat, 2 May 2026 23:34:18 +0300 Subject: [PATCH 03/14] profile selection for gpu and cpu on device selection added --- docker-compose.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index e4a77ed..f5c3577 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,13 @@ +# Two profiles ship out of the box: +# +# docker compose --profile gpu up -d # production: vLLM model server + pipeline (NVIDIA) +# docker compose --profile cpu up -d # CPU/Mac: pipeline only, in-process transformers backend +# +# Without an explicit --profile, no services run. Always pick one. + services: ocr-model: + profiles: ["gpu"] build: ./ocr-model runtime: nvidia restart: unless-stopped @@ -31,6 +39,7 @@ services: start_period: 600s ocr-pipeline: + profiles: ["gpu"] build: ./ocr_pipeline restart: unless-stopped ports: @@ -39,6 +48,7 @@ services: - ./input:/data/input - ./output:/data/output environment: + - MODEL_BACKEND=vllm - MODEL_SERVER_URL=http://ocr-model:39671 - INPUT_DIR=/data/input - OUTPUT_DIR=/data/output @@ -46,5 +56,24 @@ services: ocr-model: condition: service_healthy + ocr-pipeline-cpu: + profiles: ["cpu"] + build: + context: . 
+ dockerfile: ocr_pipeline/Dockerfile.cpu + restart: unless-stopped + ports: + - "39672:39672" + volumes: + - ./input:/data/input + - ./output:/data/output + - hf-cache:/root/.cache/huggingface + environment: + - MODEL_BACKEND=local + - LOCAL_DEVICE=cpu + - INPUT_DIR=/data/input + - OUTPUT_DIR=/data/output + - HF_HOME=/root/.cache/huggingface + volumes: hf-cache: From a28f250cd823c3bf2c66c78ad50c5b60f87155a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sat, 2 May 2026 23:38:23 +0300 Subject: [PATCH 04/14] docs: changed the doc from tr to eng --- README.md | 177 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 147 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 7c7ef0d..15507ed 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,160 @@ -# OpenCR: Türkçe ve Karmaşık Dökümanlar İçin Yüksek Performanslı OCR Hattı +# OpenCR -OpenCR, özellikle Türkçe metinler, arşiv dökümanları ve karmaşık sayfa yapısına sahip PDF'leri, yapay zeka eğitimine hazır (HuggingFace-ready) tertemiz veri setlerine dönüştüren uçtan uca bir sistemdir. +> High-performance OCR pipeline for Turkish, archival, and complex-layout documents — turning PDFs into HuggingFace-ready training datasets. -## Neden OpenCR? +OpenCR is an end-to-end open-source pipeline that converts PDFs (especially Turkish text, archival material, and pages with complex layout) into clean Parquet datasets ready for LLM training and retrieval. -- **Türkçe Odaklı Doğruluk:** DeepSeek-OCR tabanlı yapısıyla, standart OCR araçlarının zorlandığı Türkçe karakterlerde ve karmaşık sayfa düzenlerinde üstün performans sağlar. -- **Veri Seti Fabrikası:** Çıkarılan metinleri doğrudan `.parquet` formatında paketler ve tek tıkla HuggingFace'e yüklemeye hazır hale getirir. -- **Operatör Konsolu:** İşlemleri izlemek, sayfa sayfa kontrol etmek ve hataları düzeltmek için modern bir web arayüzü sunar. 
+For the Turkish version of this README, see: [README.tr.md](./README.tr.md) -## Kurulum +--- + +## Why OpenCR? + +- **Turkish-first accuracy.** Built around DeepSeek-OCR, it handles Turkish characters and difficult page layouts better than off-the-shelf OCR. +- **Dataset factory.** Outputs are packaged directly as `pages.parquet` + `documents.parquet` with deterministic train/validation/test splits and a HuggingFace dataset card. +- **Operator console.** A single-page web UI to monitor runs, validate quality page by page, retry, and publish to HuggingFace. +- **Pluggable backends.** Production-grade NVIDIA + vLLM by default; runs in-process on Apple Silicon / CPU for development; or talk to any OpenAI-compatible model server. + +--- + +## Quickstart + +### Option 1 — Docker (NVIDIA GPU, fastest path to inference) + +Requires Docker, an NVIDIA GPU, and the NVIDIA Container Toolkit. + +```bash +docker compose --profile gpu up -d +``` + +Open http://localhost:39672. Drop PDFs in `./input/`, hit **Start OCR run**. + +### Option 2 — Apple Silicon / CPU (in-process inference, no GPU needed) + +For local development, demos, and small jobs on a Mac or Linux box with no GPU. -### Docker ile Çalıştırma (GPU Gerekir) ```bash -docker-compose up -d +git clone https://github.com/cdli-ai/opencr.git +cd opencr +python3 -m venv .venv && source .venv/bin/activate +pip install -r ocr_pipeline/requirements.txt -r requirements-local.txt +MODEL_BACKEND=local ./scripts/start.sh ``` -### Lokal Geliştirme ve Web Arayüzü (Apple Silicon / CPU) -Pipeline arayüzünü Apple bilgisayarınızda veya CPU üzerinde denemek için: +Open http://localhost:39672. The DeepSeek-OCR model (~6 GB) downloads +on first request and runs in-process via `transformers` on MPS (Apple Silicon) +or CPU. Expect **5–30 seconds per page on M-series, much slower on CPU** — +fine for development, not for production batch jobs. -1. 
**Klasör ve Ortam Hazırlığı:** - ```bash - mkdir -p input output - python3 -m venv .venv - source .venv/bin/activate - pip install -r ocr_pipeline/requirements.txt - ``` +### Option 3 — Remote model server (point at any OpenAI-compatible endpoint) -2. **Başlatma:** - ```bash - export INPUT_DIR="./input" - export OUTPUT_DIR="./output" - export PYTHONPATH=$PYTHONPATH:. - python3 ocr_pipeline/main.py - ``` - Erişim: **http://localhost:39672** +If you already run vLLM somewhere, or use OpenRouter, or another endpoint +serving DeepSeek-OCR: -## Mimari -- **Backend:** vLLM tabanlı DeepSeek-OCR (Ağır iş yükü). -- **Frontend/API:** FastAPI & Alpine.js (Yönetim konsolu). +```bash +pip install -r ocr_pipeline/requirements.txt +MODEL_BACKEND=remote MODEL_SERVER_URL=https://your-endpoint MODEL_API_KEY=sk-... ./scripts/start.sh +``` + +--- + +## Configuration + +Configurable via environment variables (or a `.env` file): + +| Variable | Default | Description | +| -------------------- | -------------------------------- | ------------------------------------------------------------------------------------------------- | +| `MODEL_BACKEND` | `vllm` | `vllm` (NVIDIA, OpenAI-compatible server), `local` (in-process transformers), `remote` (alias). | +| `MODEL_SERVER_URL` | `http://ocr-model:39671` | Base URL for `vllm` / `remote` backends. | +| `MODEL_NAME` | `deepseek-ai/DeepSeek-OCR` | Model identifier. | +| `MODEL_API_KEY` | `EMPTY` | API key for remote endpoints. | +| `LOCAL_DEVICE` | auto | `auto`, `mps`, `cuda`, or `cpu` for the `local` backend. | +| `INPUT_DIR` | `./input` (or `/data/input`) | Where to read PDFs from. | +| `OUTPUT_DIR` | `./output` (or `/data/output`) | Where artifacts and the SQLite DB land. | +| `HOST` / `PORT` | `0.0.0.0` / `39672` | Where the web console serves. | +| `HF_OAUTH_CLIENT_ID` | unset | Enables "Sign in with HuggingFace" for the publish flow (requires `HF_OAUTH_CLIENT_SECRET` as well). 
See [HF OAuth setup](#hf-oauth-optional).| +| `APP_SESSION_SECRET` | random per process | Cookie-signing secret. Set to a stable value in production. | --- -*OpenCR, döküman arşivlerini dijitalleştirip modern yapay zeka dünyasına taşımak için [cdli.ai](https://cdli.ai) tarafından geliştirilmiştir.* + +## HuggingFace publishing + +Completed runs can be pushed to a HuggingFace dataset repo. Two modes: + +1. **Paste-token (default).** In the operator console, click **Publish to HuggingFace** and paste a HF write token. Or set `HF_TOKEN` in the server's environment to skip pasting. +2. **Sign in with HuggingFace (recommended for shared deployments).** Configure OAuth (below) and users sign in with their HF account. The publish flow then uses their personal token automatically. This is also how the operator console gets gated — without a session, the publish action is hidden. + +### HF OAuth (optional) + +1. Create an OAuth app at https://huggingface.co/settings/connected-applications/new with redirect URI `https://your-host/api/auth/callback` and scopes `openid profile write-repos`. +2. Set on the server: + +```bash +export HF_OAUTH_CLIENT_ID=... +export HF_OAUTH_CLIENT_SECRET=... +export HF_OAUTH_REDIRECT_URI=https://your-host/api/auth/callback +export APP_SESSION_SECRET=$(python3 -c 'import secrets; print(secrets.token_hex(32))') +``` + +3. Restart. The console gains a **Sign in with HuggingFace** button in the topbar. + +Published datasets are tagged `opencr` so they're discoverable via [HuggingFace's tag search](https://huggingface.co/datasets?other=opencr). + +--- + +## Architecture + +``` + ┌───────────────────────────────┐ + │ OCR pipeline (FastAPI) │ + PDFs ─────►. 
│ ingest → render → OCR → │ ──► pages.parquet + │ clean → validate → export │ documents.parquet + │ + operator console (Alpine) │ manifest.json + └──────────────┬────────────────┘ + │ OpenAI-compatible + ▼ + ┌───────────────────────────────┐ + │ Model backend │ + │ ┌─────────────────────────┐ │ + │ │ vllm (NVIDIA, prod) │ │ + │ │ local (MPS/CPU, dev) │ │ + │ │ remote (any OpenAI URL) │ │ + │ └─────────────────────────┘ │ + └───────────────────────────────┘ +``` + +State lives in SQLite + the filesystem. +No external queue/broker is required for single-node operation. +See [docs/architectural-overhaul-v2.md](./docs/architectural-overhaul-v2.md) for the long-form design. + +--- + +## Development + +```bash +make install # create venv, install deps +make run # start dev server with sensible defaults +make test # run pytest suite +make lint # ruff check +``` + +Tests live under `tests/`. UI is plain HTML + Alpine.js — no build step. + +--- + +## Contributing + +Contributions are welcome — bug reports, Turkish-language +test fixtures, benchmarks against other OCR engines, model-backend +ports (MLX, llama.cpp), and documentation translations are +especially useful. + +See [CONTRIBUTING.md](./CONTRIBUTING.md). + +--- + +## License + +Apache 2.0 — see [LICENSE](./LICENSE). + +OpenCR is built and maintained by [cdli.ai](https://cdli.ai) to support Turkish-language LLM research and dataset curation. 
From 2cf4d6388e3b01f3971c9a0691685812a9f6f11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sat, 2 May 2026 23:38:55 +0300 Subject: [PATCH 05/14] docs: add turkish version of readme documentation --- README.tr.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 README.tr.md diff --git a/README.tr.md b/README.tr.md new file mode 100644 index 0000000..7c7ef0d --- /dev/null +++ b/README.tr.md @@ -0,0 +1,43 @@ +# OpenCR: Türkçe ve Karmaşık Dökümanlar İçin Yüksek Performanslı OCR Hattı + +OpenCR, özellikle Türkçe metinler, arşiv dökümanları ve karmaşık sayfa yapısına sahip PDF'leri, yapay zeka eğitimine hazır (HuggingFace-ready) tertemiz veri setlerine dönüştüren uçtan uca bir sistemdir. + +## Neden OpenCR? + +- **Türkçe Odaklı Doğruluk:** DeepSeek-OCR tabanlı yapısıyla, standart OCR araçlarının zorlandığı Türkçe karakterlerde ve karmaşık sayfa düzenlerinde üstün performans sağlar. +- **Veri Seti Fabrikası:** Çıkarılan metinleri doğrudan `.parquet` formatında paketler ve tek tıkla HuggingFace'e yüklemeye hazır hale getirir. +- **Operatör Konsolu:** İşlemleri izlemek, sayfa sayfa kontrol etmek ve hataları düzeltmek için modern bir web arayüzü sunar. + +## Kurulum + +### Docker ile Çalıştırma (GPU Gerekir) +```bash +docker compose --profile gpu up -d +``` + +### Lokal Geliştirme ve Web Arayüzü (Apple Silicon / CPU) +Pipeline arayüzünü Apple bilgisayarınızda veya CPU üzerinde denemek için: + +1. **Klasör ve Ortam Hazırlığı:** + ```bash + mkdir -p input output + python3 -m venv .venv + source .venv/bin/activate + pip install -r ocr_pipeline/requirements.txt + ``` + +2. **Başlatma:** + ```bash + export INPUT_DIR="./input" + export OUTPUT_DIR="./output" + export PYTHONPATH=$PYTHONPATH:. + python3 ocr_pipeline/main.py + ``` + Erişim: **http://localhost:39672** + +## Mimari +- **Backend:** vLLM tabanlı DeepSeek-OCR (Ağır iş yükü). +- **Frontend/API:** FastAPI & Alpine.js (Yönetim konsolu). 
+ +--- +*OpenCR, döküman arşivlerini dijitalleştirip modern yapay zeka dünyasına taşımak için [cdli.ai](https://cdli.ai) tarafından geliştirilmiştir.* From afad4a78c007d36486a2194bb5d6d7e5a1a4323c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sat, 2 May 2026 23:43:17 +0300 Subject: [PATCH 06/14] feat: add HuggingFace OAuth session management, local dev startup scripts, and project licensing --- ocr_pipeline/routers/auth.py | 83 +++++++++++++++ ocr_pipeline/services/auth_session.py | 140 ++++++++++++++++++++++++++ ocr_pipeline/static/js/api.js | 12 +++ 3 files changed, 235 insertions(+) create mode 100644 ocr_pipeline/routers/auth.py create mode 100644 ocr_pipeline/services/auth_session.py diff --git a/ocr_pipeline/routers/auth.py b/ocr_pipeline/routers/auth.py new file mode 100644 index 0000000..ba6e2cd --- /dev/null +++ b/ocr_pipeline/routers/auth.py @@ -0,0 +1,83 @@ +"""HuggingFace OAuth routes. Mounted only when OAuth is configured. + +When OAuth is disabled the `/api/auth/me` endpoint still works and reports +`enabled: false` so the frontend can hide the sign-in button cleanly. 
+""" +from __future__ import annotations + +import logging + +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import RedirectResponse + +from ocr_pipeline.services.auth_session import ( + SESS_OAUTH_STATE, + build_authorize_url, + clear_session, + exchange_code, + is_oauth_enabled, + new_state, + session_user, + store_session, +) + +logger = logging.getLogger("ocr_pipeline.auth") +router = APIRouter() + + +@router.get("/api/auth/me") +async def auth_me(request: Request): + user = session_user(request.session) if hasattr(request, "session") else None + return { + "enabled": is_oauth_enabled(), + "authenticated": user is not None, + "user": ( + { + "name": user.name, + "picture": user.picture, + "profile": user.profile, + } + if user + else None + ), + } + + +@router.get("/api/auth/login") +async def auth_login(request: Request): + if not is_oauth_enabled(): + raise HTTPException(status_code=501, detail="HuggingFace OAuth is not configured.") + state = new_state() + request.session[SESS_OAUTH_STATE] = state + return RedirectResponse(build_authorize_url(state)) + + +@router.get("/api/auth/callback") +async def auth_callback(request: Request, code: str | None = None, state: str | None = None, error: str | None = None): + if error: + raise HTTPException(status_code=400, detail=f"HuggingFace returned error: {error}") + if not code: + raise HTTPException(status_code=400, detail="Missing authorization code.") + expected_state = request.session.pop(SESS_OAUTH_STATE, None) + if not expected_state or expected_state != state: + raise HTTPException(status_code=400, detail="OAuth state mismatch — request rejected.") + + try: + token_payload = await exchange_code(code) + except Exception as exc: + logger.exception("HF OAuth exchange failed") + raise HTTPException(status_code=502, detail=f"OAuth exchange failed: {exc}") + + store_session( + request.session, + access_token=token_payload["access_token"], + expires_in=token_payload.get("expires_in"), + 
userinfo=token_payload.get("userinfo") or {}, + ) + return RedirectResponse("/") + + +@router.post("/api/auth/logout") +async def auth_logout(request: Request): + clear_session(request.session) + return {"ok": True} diff --git a/ocr_pipeline/services/auth_session.py b/ocr_pipeline/services/auth_session.py new file mode 100644 index 0000000..a0482f0 --- /dev/null +++ b/ocr_pipeline/services/auth_session.py @@ -0,0 +1,140 @@ +"""HuggingFace OAuth helpers. + +Optional. When `HF_OAUTH_CLIENT_ID` is set, OpenCR enables a "Sign in with +HuggingFace" flow whose tokens drive the publish UI. When unset, the publish +flow falls back to the paste-token form (and `/api/auth/me` reports +`enabled: false`). + +Why OAuth and not just env-token-everywhere: +- Multi-user deployments shouldn't share one long-lived token. +- The token's repo permissions match the signed-in user, so users can only + push to repos they actually own. +- It's the basis for "gating" the panel — anonymous visitors see read-only. +""" +from __future__ import annotations + +import logging +import secrets +import time +from dataclasses import dataclass +from typing import Any + +import httpx + +from ocr_pipeline.config import settings + +logger = logging.getLogger("ocr_pipeline.auth") + +HF_AUTHORIZE_URL = "https://huggingface.co/oauth/authorize" +HF_TOKEN_URL = "https://huggingface.co/oauth/token" +HF_USERINFO_URL = "https://huggingface.co/oauth/userinfo" + +# Session cookie keys — kept short to fit the cookie size budget. 
+SESS_USER = "u" # dict: {name, picture, ...} +SESS_TOKEN = "t" # str: HF access token +SESS_EXPIRES_AT = "e" # float: epoch seconds +SESS_OAUTH_STATE = "s" # str: CSRF state during the redirect dance + + +@dataclass +class HFUser: + name: str + picture: str | None = None + profile: str | None = None + email: str | None = None + + +def is_oauth_enabled() -> bool: + return bool(settings.hf_oauth_client_id and settings.hf_oauth_client_secret) + + +def build_authorize_url(state: str) -> str: + from urllib.parse import urlencode + params = { + "client_id": settings.hf_oauth_client_id, + "redirect_uri": settings.hf_oauth_redirect_uri, + "response_type": "code", + "scope": settings.hf_oauth_scopes, + "state": state, + } + return f"{HF_AUTHORIZE_URL}?{urlencode(params)}" + + +def new_state() -> str: + return secrets.token_urlsafe(24) + + +async def exchange_code(code: str) -> dict[str, Any]: + """Trade an auth code for an access token + userinfo.""" + async with httpx.AsyncClient(timeout=15) as client: + token_resp = await client.post( + HF_TOKEN_URL, + data={ + "client_id": settings.hf_oauth_client_id, + "client_secret": settings.hf_oauth_client_secret, + "code": code, + "grant_type": "authorization_code", + "redirect_uri": settings.hf_oauth_redirect_uri, + }, + headers={"Accept": "application/json"}, + ) + token_resp.raise_for_status() + token_data = token_resp.json() + + access_token = token_data.get("access_token") + if not access_token: + raise RuntimeError("HF token endpoint returned no access_token") + + info_resp = await client.get( + HF_USERINFO_URL, + headers={"Authorization": f"Bearer {access_token}"}, + ) + info_resp.raise_for_status() + userinfo = info_resp.json() + + return { + "access_token": access_token, + "expires_in": token_data.get("expires_in"), + "userinfo": userinfo, + } + + +def store_session(session: dict, *, access_token: str, expires_in: int | None, userinfo: dict) -> None: + """Persist auth state on the request's session dict.""" + 
session[SESS_TOKEN] = access_token + session[SESS_USER] = { + "name": userinfo.get("preferred_username") or userinfo.get("name") or "anonymous", + "picture": userinfo.get("picture"), + "profile": userinfo.get("profile"), + "email": userinfo.get("email"), + "orgs": [o.get("name") for o in (userinfo.get("orgs") or []) if o.get("name")], + } + if expires_in: + session[SESS_EXPIRES_AT] = time.time() + int(expires_in) + + +def clear_session(session: dict) -> None: + for key in (SESS_USER, SESS_TOKEN, SESS_EXPIRES_AT, SESS_OAUTH_STATE): + session.pop(key, None) + + +def session_user(session: dict) -> HFUser | None: + if not session.get(SESS_TOKEN): + return None + expires_at = session.get(SESS_EXPIRES_AT) + if expires_at and time.time() > float(expires_at): + clear_session(session) + return None + user = session.get(SESS_USER) or {} + return HFUser( + name=user.get("name", "anonymous"), + picture=user.get("picture"), + profile=user.get("profile"), + email=user.get("email"), + ) + + +def session_token(session: dict) -> str | None: + if session_user(session) is None: + return None + return session.get(SESS_TOKEN) diff --git a/ocr_pipeline/static/js/api.js b/ocr_pipeline/static/js/api.js index 8050f1c..9b3d33a 100644 --- a/ocr_pipeline/static/js/api.js +++ b/ocr_pipeline/static/js/api.js @@ -123,4 +123,16 @@ const API = { } return res.json(); }, + + async authMe() { + const res = await fetch('/api/auth/me'); + if (!res.ok) return { enabled: false, authenticated: false, user: null }; + return res.json(); + }, + + async authLogout() { + const res = await fetch('/api/auth/logout', { method: 'POST' }); + if (!res.ok) throw new Error('Logout failed'); + return res.json(); + }, }; From 15be89794e135d9a4d7afae119c8cae7d23af32d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sat, 2 May 2026 23:43:49 +0300 Subject: [PATCH 07/14] add package called 'itsdangerous' its not. 
--- ocr_pipeline/requirements.txt | 1 + requirements-local.txt | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 requirements-local.txt diff --git a/ocr_pipeline/requirements.txt b/ocr_pipeline/requirements.txt index 6ef85ce..b7bddfa 100644 --- a/ocr_pipeline/requirements.txt +++ b/ocr_pipeline/requirements.txt @@ -12,3 +12,4 @@ httpx>=0.27.0 pyarrow>=18.0.0 aiosqlite>=0.20.0 huggingface-hub>=0.27.0 +itsdangerous>=2.2.0 diff --git a/requirements-local.txt b/requirements-local.txt new file mode 100644 index 0000000..bf4bf1d --- /dev/null +++ b/requirements-local.txt @@ -0,0 +1,12 @@ +# Optional dependencies for the in-process `local` model backend. +# Only needed when MODEL_BACKEND=local. Install on top of the base requirements: +# pip install -r ocr_pipeline/requirements.txt -r requirements-local.txt +# +# `torch` here is the CPU/MPS build. On NVIDIA boxes prefer the official +# CUDA-matched wheels from https://pytorch.org/ instead of letting pip pick. +torch>=2.4.0 +torchvision>=0.19.0 +transformers>=4.46.0 +accelerate>=0.34.0 +einops>=0.8.0 +sentencepiece>=0.2.0 From 3d775621ef9842e631d30a7d0c6cc87e4244788f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sat, 2 May 2026 23:45:07 +0300 Subject: [PATCH 08/14] contributing guidelines documented --- CONTRIBUTING.md | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..1e5b97b --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,71 @@ +# Contributing to OpenCR + +Thanks for considering a contribution. + +OpenCR exists to make Turkish-language and archival OCR easy to run, +share, and improve — every PR, issue, and dataset published with OpenCR +helps that goal. + +## Ways to help + +- **File issues.** Found a bug, a layout that OpenCR struggles with, +or a confusing piece of docs? 
Open an issue with a short PDF +(or page screenshot) we can reproduce against. + +- **Add Turkish-language test fixtures.** A small public-domain PDF + the +expected text is one of the highest-leverage contributions. + +- **Benchmarks.** Compare OpenCR against Tesseract / Surya / +PaddleOCR / Marker on a Turkish corpus and post the table — even +informal numbers are useful. + +- **Model-backend ports.** MLX, llama.cpp, ONNX, or any other runtime +that improves throughput on a target platform. + +- **Translations.** README and dataset cards in additional languages. + +## Setup + +```bash +git clone https://github.com/cdli-ai/opencr.git +cd opencr +make install +make test +``` + +`make run` starts a local dev server on http://localhost:39672 with the `local` model backend (no GPU needed; ~5–30 s/page on M-series Macs). + +## Code style + +- Python: keep it boring and explicit. Type hints on public functions. No new dependencies without a brief rationale in the PR. + +- Frontend: stays Alpine + plain CSS until the state model genuinely outgrows it. No build step, no framework rewrite. + +- Tests: every new code path should have a unit or integration test. We use `pytest` and `pytest-asyncio`. + +## Pull request flow + +1. Open an issue first for non-trivial changes — a 5-line discussion saves a 500-line rewrite. + +2. Branch from `main`, name it `feat/...` or `fix/...`. + +3. Run `make lint test` before pushing. + +4. PR description: what changed, what it fixes, how to verify locally. + +## Reporting OCR-quality regressions + +If a particular PDF regresses after a change, please attach +(or link to a public copy of) the PDF, the page number, what +OpenCR produced, and what was expected. Quality bugs without +a reproducer are very hard to act on. + +## Code of conduct + +Be respectful. We're a small project trying to do useful work for +Turkish-language NLP — no room for harassment or +discrimination here. 
+ +## License + +By submitting a PR, you agree your contribution is licensed under the project's [Apache 2.0 License](./LICENSE). From 2cf37d33f21966a9c56c650f94181ead34d3abf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ada=20=C3=87=C4=B1nar?= Date: Sun, 3 May 2026 01:03:31 +0300 Subject: [PATCH 09/14] auth cluster style and bar added --- ocr_pipeline/static/css/style.css | 6 ++++++ ocr_pipeline/static/index.html | 30 ++++++++++++++++++++++----- ocr_pipeline/static/js/app.js | 34 ++++++++++++++++++++++++++++++- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/ocr_pipeline/static/css/style.css b/ocr_pipeline/static/css/style.css index e7e5faf..2912a34 100644 --- a/ocr_pipeline/static/css/style.css +++ b/ocr_pipeline/static/css/style.css @@ -533,3 +533,9 @@ input[type="checkbox"] { width: 15px; height: 15px; accent-color: var(--accent); .run-summary { grid-template-columns: repeat(2, 1fr); } .toast-container { right: 10px; left: 10px; } } + +/* ---------------- auth cluster ---------------- */ +.auth-cluster { display: flex; align-items: center; gap: 10px; margin-left: 16px; } +.auth-user { display: flex; align-items: center; gap: 8px; } +.auth-avatar { width: 24px; height: 24px; border-radius: 50%; object-fit: cover; } +.auth-name { font-size: 0.82rem; font-weight: 600; color: var(--fg); } diff --git a/ocr_pipeline/static/index.html b/ocr_pipeline/static/index.html index 9a85fef..3529cf4 100644 --- a/ocr_pipeline/static/index.html +++ b/ocr_pipeline/static/index.html @@ -33,6 +33,16 @@

OpenCR

+
+ +
+ + + +
+
@@ -195,9 +205,16 @@

Run

- +

+ Publishing requires sign-in. +

@@ -306,15 +323,18 @@

Publish to HuggingFace

Repo id -