From d2ee65b46457c7282e2917468e2e8b4c36723043 Mon Sep 17 00:00:00 2001 From: Bytebinders Date: Fri, 29 May 2026 18:14:51 +0100 Subject: [PATCH 1/3] feat: implement OCR regression testing harness with CLI, evaluator, and automated workflow support --- .github/workflows/ocr-regression.yml | 56 +++++++ app/ai-service/regression_harness/README.md | 66 ++++++++ app/ai-service/regression_harness/__init__.py | 1 + app/ai-service/regression_harness/cli.py | 88 ++++++++++ .../regression_harness/dataset/__init__.py | 1 + .../dataset/ground_truth.json | 20 +++ .../regression_harness/evaluator.py | 154 ++++++++++++++++++ app/ai-service/regression_harness/models.py | 79 +++++++++ 8 files changed, 465 insertions(+) create mode 100644 .github/workflows/ocr-regression.yml create mode 100644 app/ai-service/regression_harness/README.md create mode 100644 app/ai-service/regression_harness/__init__.py create mode 100644 app/ai-service/regression_harness/cli.py create mode 100644 app/ai-service/regression_harness/dataset/__init__.py create mode 100644 app/ai-service/regression_harness/dataset/ground_truth.json create mode 100644 app/ai-service/regression_harness/evaluator.py create mode 100644 app/ai-service/regression_harness/models.py diff --git a/.github/workflows/ocr-regression.yml b/.github/workflows/ocr-regression.yml new file mode 100644 index 00000000..95d690f3 --- /dev/null +++ b/.github/workflows/ocr-regression.yml @@ -0,0 +1,56 @@ +name: OCR Regression Test + +on: + push: + paths: + - 'app/ai-service/services/ocr.py' + - 'app/ai-service/services/preprocessing.py' + - 'app/ai-service/regression_harness/**' + branches: [ main, develop ] + pull_request: + paths: + - 'app/ai-service/services/ocr.py' + - 'app/ai-service/services/preprocessing.py' + - 'app/ai-service/regression_harness/**' + branches: [ main ] + workflow_dispatch: + +jobs: + regression: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: 
actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install System Dependencies + run: | + sudo apt-get update + sudo apt-get install -y tesseract-ocr libtesseract-dev + + - name: Install Python Dependencies + working-directory: ./app/ai-service + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install Pillow pytesseract + + - name: Run OCR Regression Harness + working-directory: ./app/ai-service + run: | + export PYTHONPATH=$PYTHONPATH:. + python regression_harness/cli.py --output ocr_report.json + + - name: Upload Regression Report + if: always() + uses: actions/upload-artifact@v4 + with: + name: ocr-regression-report + path: app/ai-service/ocr_report.json + retention-days: 14 diff --git a/app/ai-service/regression_harness/README.md b/app/ai-service/regression_harness/README.md new file mode 100644 index 00000000..9aeb91de --- /dev/null +++ b/app/ai-service/regression_harness/README.md @@ -0,0 +1,66 @@ +# OCR Regression Harness + +The OCR Regression Harness is a tool designed to prevent extraction accuracy regressions by running OCR against a "golden dataset" of representative documents and comparing the results to ground truth values. + +## Directory Structure + +- `regression_harness/`: Main package for the harness. + - `cli.py`: Command line interface. + - `evaluator.py`: Evaluation logic. + - `models.py`: Data models for samples and reports. + - `dataset/`: Contains the golden dataset. + - `documents/`: Folder for raw images (PNG, JPG). + - `ground_truth.json`: The source of truth for expected values. + +## How to Run Locally + +1. Ensure you are in the `app/ai-service` directory. +2. Install dependencies: + ```bash + pip install -r requirements.txt + ``` +3. Run the harness: + ```bash + export PYTHONPATH=. 
+ python regression_harness/cli.py + ``` + *Note: On Windows, use `set PYTHONPATH=.`* + +### CLI Options + +- `--dataset`: Path to the ground truth JSON, resolved relative to `app/ai-service` (default: `regression_harness/dataset/ground_truth.json`). +- `--output`: Path to save a machine-readable JSON report. +- `--threshold`: Minimum confidence threshold for fields (default: 0.8). + +## Adding New Golden Samples + +1. **Add the Image**: Place the document image in `regression_harness/dataset/documents/`. +2. **Update Ground Truth**: Edit `regression_harness/dataset/ground_truth.json` to add a new entry in the `samples` array. + +```json +{ + "id": "item_001", + "image_path": "documents/item_001.png", + "expected_fields": { + "name": "EXACT EXPECTED NAME", + "id_number": "EXPECTED ID" + }, + "metadata": { + "document_type": "passport", + "language": "en" + } +} +``` + +## Error Classification + +Each reported problem is classified into one of these groups: +- **Missing field**: A required field was not detected by the OCR service. +- **Incorrect value**: The field was detected but the value didn't match the ground truth. +- **Unexpected field**: OCR extracted a field that wasn't defined in the ground truth (reported, but does not fail the sample on its own). +- **Low confidence**: The field matched but the OCR engine's confidence was below the threshold (counted in the error breakdown, but does not fail the sample on its own). +- **Image not found**: The `image_path` in the ground truth, resolved relative to the directory containing `ground_truth.json`, does not exist. + +## CI Integration + +The harness runs automatically, via `.github/workflows/ocr-regression.yml`, on pushes to `main`/`develop` and on pull requests targeting `main` that touch OCR logic or the regression harness itself. If the accuracy falls below 100% (or if any sample fails), the CI job will fail. 
# app/ai-service/regression_harness/cli.py
#
# Command-line entry point for the OCR regression harness: loads the golden
# dataset, runs the evaluator over it, prints a human-readable summary,
# optionally writes a JSON report, and signals the outcome via the exit status.
import argparse
import json
import os
import sys
from typing import List, Optional, Sequence

# Dataset location used when --dataset is not given (relative to app/ai-service).
DEFAULT_DATASET = "regression_harness/dataset/ground_truth.json"

# Exit statuses. 2 mirrors argparse's own status for unusable invocations.
EXIT_REGRESSION = 1
EXIT_UNUSABLE_DATASET = 2

_RULE = "=" * 50


def load_samples(ground_truth_path: str) -> "List[EvaluationSample]":
    """Load the golden samples described by a ground-truth JSON file.

    Args:
        ground_truth_path: Path to a JSON document with a top-level
            ``samples`` array. Each entry must provide ``id``, ``image_path``
            and ``expected_fields``; ``expected_bboxes`` and ``metadata`` are
            optional.

    Returns:
        One ``EvaluationSample`` per entry, in file order (empty when the
        ``samples`` key is absent).

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: The file is not valid JSON.
        KeyError: An entry lacks one of the required keys.
    """
    # Imported here rather than at module level so that `--help` and
    # dataset-path errors do not require the rest of the harness to import.
    from regression_harness.models import BoundingBox, EvaluationSample

    # JSON is UTF-8 by specification; do not depend on the locale's encoding.
    with open(ground_truth_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    samples = []
    for entry in data.get("samples", []):
        bboxes = {
            name: BoundingBox.from_dict(box)
            for name, box in entry.get("expected_bboxes", {}).items()
        }
        samples.append(EvaluationSample(
            id=entry["id"],
            image_path=entry["image_path"],
            expected_fields=entry["expected_fields"],
            expected_bboxes=bboxes,
            metadata=entry.get("metadata", {}),
        ))
    return samples


def print_summary(report) -> None:
    """Print the aggregate results, then per-field details of failed samples.

    Args:
        report: Object exposing ``total_samples``, ``passed_samples``,
            ``failed_samples``, ``accuracy_percentage``, ``error_counts`` and
            ``sample_results`` (a ``RegressionReport`` in normal use).
    """
    print("\n" + _RULE)
    print(" OCR REGRESSION HARNESS SUMMARY")
    print(_RULE)
    print(f"Total Samples: {report.total_samples}")
    print(f"Passed: {report.passed_samples}")
    print(f"Failed: {report.failed_samples}")
    print(f"Accuracy: {report.accuracy_percentage:.2f}%")
    print("-" * 50)
    print("Error breakdown:")
    for err, count in report.error_counts.items():
        # Categories that never occurred are omitted to keep the output short.
        if count > 0:
            print(f" {err:20}: {count}")
    print(_RULE + "\n")

    if report.failed_samples > 0:
        print("FAILED SAMPLES DETAILS:")
        for res in report.sample_results:
            if res.passed:
                continue
            print(f"\n[!] Sample ID: {res.sample_id}")
            # `field_eval`, not `eval`: avoid shadowing the builtin.
            for field_eval in res.field_evaluations:
                if not field_eval.is_match:
                    print(f" - {field_eval.field_name}: Expected '{field_eval.expected_value}', Got '{field_eval.actual_value}' (Error: {field_eval.error_type})")
        print("\n" + _RULE)


def _resolve_dataset_path(dataset: str, base_dir: str) -> str:
    """Return the dataset path, preferring *base_dir* and falling back to the cwd.

    The path under *base_dir* is returned whenever it exists (or *dataset* is
    absolute, in which case ``os.path.join`` yields *dataset* itself). Only if
    that is missing is the path tried relative to the current directory; when
    neither exists the *base_dir* variant is returned so the caller's error
    message names the primary location.
    """
    primary = os.path.join(base_dir, dataset)
    if os.path.isabs(dataset) or os.path.exists(primary):
        return primary
    fallback = os.path.abspath(dataset)
    return fallback if os.path.exists(fallback) else primary


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run the regression suite and exit non-zero on any problem.

    Args:
        argv: Argument list to parse; ``None`` (the default) uses
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when at least one sample failed, and with
            status 2 when the dataset file is missing or contains no samples
            (a run that verified nothing must not look like a pass in CI).
    """
    parser = argparse.ArgumentParser(description="OCR Regression Harness")
    parser.add_argument("--dataset", default=DEFAULT_DATASET, help="Path to ground truth JSON")
    parser.add_argument("--output", help="Path to save JSON report")
    parser.add_argument("--threshold", type=float, default=0.8, help="Confidence threshold")
    args = parser.parse_args(argv)

    base_dir = os.path.dirname(os.path.abspath(__file__))
    # This file lives in <service root>/regression_harness; paths are resolved
    # against the service root (app/ai-service), one level up.
    if base_dir.endswith("regression_harness"):
        base_dir = os.path.dirname(base_dir)

    gt_path = _resolve_dataset_path(args.dataset, base_dir)
    if not os.path.exists(gt_path):
        print(f"Error: Dataset not found at {gt_path}", file=sys.stderr)
        sys.exit(EXIT_UNUSABLE_DATASET)

    # Deferred: importing the evaluator pulls in the OCR service and imaging
    # libraries, which are not needed to report the errors handled above.
    from regression_harness.evaluator import OCREvaluator

    samples = load_samples(gt_path)
    evaluator = OCREvaluator(tolerance_threshold=args.threshold)

    print(f"Running evaluation on {len(samples)} samples...")
    # Image paths in the dataset are relative to the ground-truth file.
    report = evaluator.run_suite(samples, os.path.dirname(gt_path))

    print_summary(report)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(report.to_dict(), f, indent=2)
        print(f"Report saved to {args.output}")

    if report.total_samples == 0:
        print("Error: Dataset contains no samples; nothing was verified.", file=sys.stderr)
        sys.exit(EXIT_UNUSABLE_DATASET)
    if report.failed_samples > 0:
        sys.exit(EXIT_REGRESSION)


if __name__ == "__main__":
    main()
import json
import os
import sys
import time
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Optional, Tuple

# Add the parent directory to path so we can import services
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


# ---------------------------------------------------------------------------
# app/ai-service/regression_harness/models.py
# ---------------------------------------------------------------------------

@dataclass
class BoundingBox:
    """Axis-aligned rectangle given by its top-left corner and size."""

    x: int
    y: int
    width: int
    height: int

    def to_dict(self) -> Dict[str, int]:
        """Return the box as a plain ``{"x", "y", "width", "height"}`` dict."""
        return {"x": self.x, "y": self.y, "width": self.width, "height": self.height}

    @classmethod
    def from_dict(cls, data: dict) -> "BoundingBox":
        """Build a box from a dict with exactly the keys produced by ``to_dict``.

        Raises:
            TypeError: A key is missing or an unknown key is present.
        """
        return cls(**data)


@dataclass
class EvaluationSample:
    """One golden document: the image plus the values OCR is expected to find."""

    id: str
    # Relative to the directory passed to the evaluator as ``base_dir``.
    image_path: str
    expected_fields: Dict[str, str]
    expected_bboxes: Dict[str, BoundingBox] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class FieldEvaluation:
    """Outcome of comparing a single field against the ground truth."""

    field_name: str
    expected_value: Optional[str]
    actual_value: Optional[str]
    is_match: bool
    # One of 'missing_field', 'incorrect_value', 'unexpected_field',
    # 'image_not_found', or None when the field matched.
    error_type: Optional[str] = None
    confidence: float = 0.0


@dataclass
class SampleResult:
    """All field evaluations for one sample plus the raw OCR output."""

    sample_id: str
    field_evaluations: List[FieldEvaluation]
    passed: bool
    raw_text: str
    processing_time_ms: int


@dataclass
class RegressionReport:
    """Aggregate outcome of a suite run."""

    total_samples: int
    passed_samples: int
    failed_samples: int
    accuracy_percentage: float
    error_counts: Dict[str, int]
    sample_results: List[SampleResult]

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serialisable view: a ``summary`` and per-sample ``details``."""
        details = []
        for result in self.sample_results:
            details.append({
                "sample_id": result.sample_id,
                "passed": result.passed,
                "fields": [
                    {
                        "name": f.field_name,
                        "expected": f.expected_value,
                        "actual": f.actual_value,
                        "match": f.is_match,
                        "error": f.error_type,
                        "confidence": f.confidence,
                    }
                    for f in result.field_evaluations
                ],
            })
        return {
            "summary": {
                "total": self.total_samples,
                "passed": self.passed_samples,
                "failed": self.failed_samples,
                "accuracy": self.accuracy_percentage,
                "error_breakdown": self.error_counts,
            },
            "details": details,
        }


# ---------------------------------------------------------------------------
# app/ai-service/regression_harness/evaluator.py
# (in the split-file layout the model classes above are imported from
# regression_harness.models)
# ---------------------------------------------------------------------------

class OCREvaluator:
    """Runs the OCR service over golden samples and scores the extracted fields."""

    def __init__(self, tolerance_threshold: float = 0.8, iou_threshold: float = 0.5,
                 ocr_service: Any = None):
        """Create an evaluator.

        Args:
            tolerance_threshold: Matched fields whose confidence is below this
                value are counted as ``low_confidence`` in the report.
            iou_threshold: Stored for bounding-box checks; not used yet because
                per-field boxes are not compared (see ``_evaluate_expected_fields``).
            ocr_service: Object with a ``process_image(image)`` method. When
                omitted, ``services.ocr.OCRService`` is instantiated.
        """
        if ocr_service is None:
            # Imported lazily so an injected service does not require the
            # real OCR stack to be importable.
            from services.ocr import OCRService
            ocr_service = OCRService()
        self.ocr_service = ocr_service
        self.tolerance_threshold = tolerance_threshold
        self.iou_threshold = iou_threshold

    def evaluate_sample(self, sample: EvaluationSample, base_dir: str) -> SampleResult:
        """Run OCR on one sample and compare every field with the ground truth.

        Args:
            sample: The golden sample to evaluate.
            base_dir: Directory against which ``sample.image_path`` is resolved.

        Returns:
            A ``SampleResult``. It is failed when the image is not a readable
            file path, or when any expected field is missing or has a different
            value. Unexpected extra fields are recorded but do not fail the
            sample.
        """
        image_path = os.path.join(base_dir, sample.image_path)
        # isfile (not exists): a directory at that path is not a usable image.
        if not os.path.isfile(image_path):
            return self._image_not_found_result(sample)

        from PIL import Image

        # Context manager releases the file handle once OCR has consumed the image.
        with Image.open(image_path) as image:
            result = self.ocr_service.process_image(image)

        field_evals, all_passed = self._evaluate_expected_fields(sample, result)
        field_evals.extend(self._collect_unexpected_fields(sample, result))

        return SampleResult(
            sample_id=sample.id,
            field_evaluations=field_evals,
            passed=all_passed,
            raw_text=result.raw_text,
            processing_time_ms=result.processing_time_ms,
        )

    def _image_not_found_result(self, sample: EvaluationSample) -> SampleResult:
        """Build the failed result used when the sample's image cannot be located."""
        return SampleResult(
            sample_id=sample.id,
            field_evaluations=[
                FieldEvaluation(
                    field_name="all",
                    expected_value=None,
                    actual_value=None,
                    is_match=False,
                    error_type="image_not_found",
                )
            ],
            passed=False,
            raw_text="",
            processing_time_ms=0,
        )

    def _evaluate_expected_fields(self, sample: EvaluationSample,
                                  result: Any) -> Tuple[List[FieldEvaluation], bool]:
        """Compare each expected field with the OCR output; return (evaluations, all_passed)."""
        # NOTE(review): result.fields is used as a mapping of field name to an
        # object exposing .value and .confidence - confirm against OCRResult.
        evaluations: List[FieldEvaluation] = []
        all_passed = True
        for field_name, expected_value in sample.expected_fields.items():
            detected = result.fields.get(field_name)
            # Identity check: only a genuinely absent entry counts as missing.
            if detected is None:
                evaluations.append(FieldEvaluation(
                    field_name=field_name,
                    expected_value=expected_value,
                    actual_value=None,
                    is_match=False,
                    error_type="missing_field",
                ))
                all_passed = False
                continue

            actual_value = detected.value
            is_match = self._compare_values(expected_value, actual_value)
            if not is_match:
                all_passed = False

            # Bounding boxes are not checked here: the OCR result does not
            # carry per-field boxes yet. _calculate_iou is ready for when it does.
            evaluations.append(FieldEvaluation(
                field_name=field_name,
                expected_value=expected_value,
                actual_value=actual_value,
                is_match=is_match,
                error_type=None if is_match else "incorrect_value",
                confidence=detected.confidence,
            ))
        return evaluations, all_passed

    def _collect_unexpected_fields(self, sample: EvaluationSample,
                                   result: Any) -> List[FieldEvaluation]:
        """Return an evaluation for every extracted field absent from the ground truth."""
        return [
            FieldEvaluation(
                field_name=field_name,
                expected_value=None,
                actual_value=detected.value,
                is_match=False,
                error_type="unexpected_field",
            )
            for field_name, detected in result.fields.items()
            if field_name not in sample.expected_fields
        ]

    def _calculate_iou(self, box1: BoundingBox, box2: BoundingBox) -> float:
        """Return intersection-over-union of two boxes (0.0 when both are empty)."""
        left = max(box1.x, box2.x)
        top = max(box1.y, box2.y)
        right = min(box1.x + box1.width, box2.x + box2.width)
        bottom = min(box1.y + box1.height, box2.y + box2.height)

        # Clamp at zero: disjoint boxes give a negative extent on some axis.
        intersection = max(0, right - left) * max(0, bottom - top)
        union = box1.width * box1.height + box2.width * box2.height - intersection

        return intersection / union if union > 0 else 0.0

    def _compare_values(self, expected: Any, actual: Any) -> bool:
        """Return True when the values agree, ignoring case and surrounding whitespace.

        If either side is empty/None the values must be exactly equal. Values
        are converted with ``str`` first so non-string ground truth (e.g. a
        JSON number) is compared rather than raising.
        """
        if not expected or not actual:
            return expected == actual
        return str(expected).strip().lower() == str(actual).strip().lower()

    def run_suite(self, samples: Iterable[EvaluationSample], base_dir: str) -> RegressionReport:
        """Evaluate every sample and aggregate the outcome.

        Args:
            samples: Samples to evaluate; any iterable, consumed once.
            base_dir: Directory against which each sample's image path is resolved.

        Returns:
            A ``RegressionReport`` whose accuracy is the percentage of samples
            that passed (0.0 for an empty suite).
        """
        error_counts = {
            "missing_field": 0,
            "incorrect_value": 0,
            "unexpected_field": 0,
            "image_not_found": 0,
            "low_confidence": 0,
            "bbox_mismatch": 0,
        }
        results: List[SampleResult] = []

        for sample in samples:
            res = self.evaluate_sample(sample, base_dir)
            results.append(res)

            for eval_item in res.field_evaluations:
                if eval_item.error_type in error_counts:
                    error_counts[eval_item.error_type] += 1

                # Low confidence is tracked only for fields whose value matched;
                # a confidence of None is treated as "not reported".
                if (eval_item.is_match
                        and eval_item.confidence is not None
                        and eval_item.confidence < self.tolerance_threshold):
                    error_counts["low_confidence"] += 1

        # Count what was actually evaluated so generators work too.
        total_count = len(results)
        passed_count = sum(1 for r in results if r.passed)
        accuracy = (passed_count / total_count * 100) if total_count > 0 else 0.0

        return RegressionReport(
            total_samples=total_count,
            passed_samples=passed_count,
            failed_samples=total_count - passed_count,
            accuracy_percentage=accuracy,
            error_counts=error_counts,
            sample_results=results,
        )
zZ{I~Gd)B44WGT;g#`9P)+&Av-jQ=K^sRkDpXDJQEz!=13Bnjc|AR!X|=fk2wAXM-r zun`Cisek+MPnDVk6aG?VG5GChs76&{m(b z_K()d*Xw)?99V2u*p*Y+qu4CXZ|*K_iO|e$%=BPgq_f!VpeL2z-hR1^N%nD07MowDk1!+HIr>dl7*Gc+$w;H`zH{!YNg0G=)~mjcHPBH8od|$eQGo z)3*9E5s%qcs2XPuW@7QE9V&e*<w)u+7nkJ5BfAn?+VI&+O9rk}CNy-tDJiKbtgk~I8~OU)aX{|mMm~U?|XTr zy2xL-J`u2>ikg~&{o|UyT6?|gx zErNkM^Sy+8WpOJN2D!qy=ho7|8)2eH#Ay%oBvxde2Q zi>{C5^DvV=Vy=s2?>fF~=COC=;}iH-=W`h1U`flzJ}6%k9qk{?Qy%#hHBz;cOeKOh zFc>ctjIy>lJqhr$vB97ei`vWYkS%_n$#Ib=~f*YmHGEhlM$JWDoQE`#pj3 zKdKo7=-Nc}@(6OC!QGK4Z6n>;DcxNUINR!x5qucmfs>bKyU-nyovXH+o}EVHc(@%fzV$9Fpvb)KV0rm4 zxck~QjyU#OmGUGZIuuI!a3;(n>`o;eOJBd*+5XIn$L??PDk``Iv^&|_dN^#Qh0aS) z>7oR7vZ44lYUlbTm%`_z>Cn>cK@1EG$30@yBcwi0sZEQ_Pmi(hFO<~o8Kr*v<}@xS zA>kw^CpkCfyFmFO=!clU>42Z@ZF)ko5UP_s`kx(fJV?pDwy3=df%_Cyf!R87?4u~u z8QDct)&gG!Wq&C3!^L}q$lj5;g*bHGNm%NzyE_BECIb?W>{qpWne8 z$;e6!i(yyiwqEZB0V&0i?;a*`oQSWqp5qU?GOE^Vi(1hJ`k%t1sJNJcVP03=htDyE zZjYNzh?JE%_w`%vyDWFOo5sI?*EryJbK?uf<2pN7xL$dcjASQVsL3%BRyLPLe2Yo}O-RZ9SWxjG${9PiV)(v8dm8{b&Vnpoy{sU7u>auBkg$ zIwnIPcnOOp8&R2q(~mL?u}vto&9I1TCS||VU+y@}Kdwu`!V=Y)O0Bz>XmRnPX;joo z{Eu|9t)B8U^7WU&GKJ+6+|g_)!tr{ls$F$;u1j2d1e2>H1mkid?JVpCc-Q2}9&YyL zY14-cJLvfM&wgw?zq{>4lUBpd4q^d~uX7l5M^qQOw^s!hXVbdcuQYIf|5otz$*EX# z^P3JMM3e7Pv#9hB@k>vi<_Kt18aW%^aoJv+%=osvJeunNID#$V!yiV)<+1llxj9_R z%kO0eers3X&k+%G?w$H`hsM>Nfx%ZshR2~T@MBXtW@DX_WqO_JcxFYwOW4iBV+BeldVIV$T);kI{FA!z<(-8dkN2-D zCI^(3S8Vf*MA9863)Hht$5%}Z)Ww`XE_2c%yB$uv1u6^ChbMbwepLs9c|;hj!*Ak) zR5#0FS+rAH=@+$)L#DcwRaY~@R!1h;qIsRtL&jNE9s<*Hbuv&nZhd?%b2pd>AUzRv$;>rNG`kgNXLtFpE0Ze60Msi_DkBSW1W+f{91 zKfdT-{t5j_q*!nnu7wTP3QbATs`oea^SgSsAcpI)O2$#{-j%TEeO92d!pxkDjq}!a zr8fIcz*#^?*GgxZffd$7q&;qyI6mV%-+7b&Ce~K zJNnG*rrQ0DEuo%TK5M(rGwWi~ZCyf*P>O_@8jOk5bVklM}LR}-H{9^_vv9O-I4sA<6pWa_T_VlPSWuPD zmObv_iDx;VLJwOdg1=>|E9=Y3eRxNC3nsOe;T`}l`|!zEBs ze75NvMoDNIpY_iE{>=%Wbs3>*2lcGc(Azt)`St>4W|V$@DY6&q z_4NK)R){*-8Fsx~K2AhrWNG!z+}Od&{F8`V?pj+^yXg1gfHUKcxa!t<0WQ3A`r4QY zNopQic{@X{Qy@d%6$a{xHyICGcmF_=?jSGHrSE~#K+q&2@#0wpu1QIL@o{X 
zfAercaX^^Lcj_-7hZmEg56oWv-%-Q=p(g)l?aV9afBc}4v#+m@5`6>lJapJAkW*AN z#e^)#WOsF>k*zR0`w5xl=4>0~V3E>m5$!zfLJS`INlsoK1v&Z7`s8Fw3nR2sSy`Fc z(G^5*csB)-`@)3_rbQJQJ{J_#_4V~NG@?^d{EzoHu8Zz}L!(7FI0`d_E+bUJx~Y)e z-Q5o#K1@qXgNv&S zfkBz$WJ+qPvE!@J(NUL$Zp8%NwuS~t8JU*v-%ZvhYMtH8%bRxB#;nZD8W*-?LT_id-CKP+wokS@h`>P+}v?AdM<&Fm$xS1 ztnRa}yu3VB*f2X|#*Nj=Nc`rvZ{OP4+06=a5Mtxt;*Q{~UfbB+o!5Fo1}a2)2*vGo z(%R>(vhUf%te*OA& zX6ryQO+`zB=_VY;!Mf-kAwpy9yK%WZVxrb(Y;3H9G$tpbg2F&oR~Jcm>((uLpKQGa zYJ_0EfcYa?Y^P_>_V;JP!YGAe6B6`1>^(jA;f_PzzaJPFAPb7;x4%t;JZbCTGi|~i zS)mf~CX`G{PA)AesdxoLT3y-Rc1=^s&dF(ec1Og!q^9P0XJt6w|9Sq(($6dc0s>1* z%VHuP9vUG7%Y6U20>{Y%-PG@?%w9l6v)SaRH65O!X z-inAf-@M}=wahd#T55|7_dhwjKO&_mc82&UslkAqpPzqne4L)1PDDghF?aPk6)zuO zt@n~pmXjoRAUpu>tIlai8P&2{wk zS%|~)m6&c8^Eg`=2liTD*E~GF^xQ@@{_Z{G{?XCO%F1vuJ3vKRR#sMCp1ztI5edo8 zh*fH-ud%48DAXwQ^l+guR4?hfWnE3NWzAhjbDFBNi`yoSYl#g03i7I__(<5n%~_5NuMe$Exk5G z&&S7i>Cz>@Z%4;Vr(uTa&fB(9$u&~O?hmD-)bz(%f(35IK#z0G(PCiB@=AUHpVR3PBe(g5mIe)SM_RP#o z{F$n@wg5Nx@??E|SeUYghK9YpeN=*=-yv@!C;Zqkqrf>Im;-7H3LjtdH`T31PM{;d z!)@If&kZ2I#Hzww;evTNW##L5c&_g5mSjKL+aEjFtE%2m4~~zIpXMJSuJGPj=3n8X z3E4@CiI1;*HYEwG%Ep!t)lr~Vnp;v*B1eKvV`5?wcm=~Fd$vfEG zM@L74;Zj%sSy52|;OF5wysQc@9i0cRi@nn7D((g(BqY%2Ge1InP7Zh4q8P`rQr7}a z$e=~sR)&CDaIjtm<+ACQDYU0O;Y2$`FL|k`gsU(m343kU`s}rLbo~7JGfN@f8O}iM zzc0~-!(m*9hmGCa*Z0tC)7;DJV5TJunwd(}w*pS+E~tcfB1b}NPIsj2R9XheFZ199 zO;k)wvr%r9=LW65nz}kMIr&4U=|?UuF820oFD_lhjC_^Rnw*?8#)L0)CwCO&W@lsL z;jK>Ai-ooDaC0Z{+w1A+B}^O5&CRhHRt>;v!~OR6JGVvBpR++OG-}7%A^`7Ws#`un z&$i~fR!ucDZqm#G&8k25P!<5!kmD(b zBw_d&2(ek&*ycbh;L0W6wTm9Vrg4ms*Y}lNUaQn#rsTB_ud4C}1N-??O-)kE3_g{x zP8-fxxg#SI3k$22d3Ir8>)^P2!dt$r?!rGSiO(J5XOF0a?{i3YDH4 z=_*X-S9MZ-H`@dzeKRU3-#psHRIeW%9c5)^YHDebl9EEb4h;_Od=I>eZ?9%xAZ*(7 z8rG!Fd*?B2gt?_9W~{ttaIiExJ9{KpT~jkAHnz;Fhf1>bSCJXX`E7oCc^S9==TCYf zxeO1vusaXAt-6zr_SUsEHAOuCJOM#lR8&-43ak$e)`f9ddHeb`Cj@7GedL@^zQLH( zRaN^wb;bDZk)(!OhCj~4YaC#!_t|6j!4A=aqP&NER%j0}IM~&-25SIo!L~$!;C-`v znTDnbjZP-q{4*uxu4-tA?&x6Q=jRs_tFNfo-<<2X_~P!pd&uMuU%a;Fjr|W^5dup! 
z2L*U~KAU+&(l_=rN31NJ&ftPZj}lYtned z!vhqc^PGV8F{Y#B@ESS4osOGZX?gkH$s^ z54V>Z?T3E+2qN5E9?Xl~#z!Qe<^tRK`+GB0b2Qpd?*RBewI2Iw)k7qio{=%~SLL@S z_~ei-jT_KLJi}e+jL*o(P&YI*bo}b>?(QpU%yhH;F;X(EprAnf?vq<{FIB!fwe|J) z>wS450Rmb-YeGKb9VuI+ZSKU3DNDdc^R4lQzo{~oMNR#@y^ChBu%K>1VuL<*c6L@u z7PPRpR|GA4O|>`r<@GD_7_{TmwWo{yi_qQGu1gJm#MSO=hr45r#FA=iYSB5I=B!C3%}e|!}}QfJ`>Yj+M~o!UQsJwi_+xy=@HuvNBw_f}Z`GbQ?c70rdtP-IF2`#qSiEwYpI8h}H`nS0ufB zLqo$&F{0m7Q)#|O88Tt>?0V;dE}dcL3nkg@+bM!>?Ok16y}e(2_Aoq_olgQTJ@CL7 zM2)J>qaAmMX$i5RW|rpW2J`g=fOJ5e|N5m|LkRl|jm6xZ#>(tL;{JuU0SAP0Tb0mIQ&ST^KXD>(dV6^g z64c}@Z>-@`vZNvS(o z$m4TcrAp!+*iagXH{71`L>G&gT3Xl1PXSC7>B`B+%Y0SgZf4#wngx-P0n zwgR`AlNiM_ zxcp#-%ykhFk-75xy$X%j_4NVZDV?2*W$yuf34wGtTCZ2eHaW`^ZeI7N0F52b`&7-> zw+23MZy&92G6Tj-NEnSX_RleY1*Xs1#^%c9%gKUnx#{V&Fu;zEJdnm?W7U+0*AW4> z2lhYs)z&`|4SnD;1Nmyfx6-`@@eV1??6((}>|RYzn^ICzj_B=rRp;h1J4Cm8dV2B> zWWuT3)u`jd?=mqlJz?SXDaKm?oVH62N}-EBZ`9oK;^1HzD0_={;IW0NlqDGTJ6a>? zR;W->QP6$gUtV?_CZOO~U?pX-R(B{?$XKuAX8e>R68AXMd2d%|(r+($ljM`y(vJ{| zIp%F>Cy-g5uIde*4Y6z4ei=jA1Q6 zkHBQ9;~&0#`}Vu(4KX;W;=(doBus}<#WNGAZ6zfoU(reU7<4MS$_=2`A)6v7C-e)2$w{kqF%_PCPR^J7JRsqHX0@87>ae!R=b$mr+i zCz`sxxJU*s3W_|>Tx<$XVX81gyKS^19h|EE?DVL@X$B7u@BRDt6MlPWD0X&GDhD{o zaC{1WPzaL~6Dch-@gF{PCJKP5QF17_k0>dXM&WUGGVH#By;Z}*OTDjQB3|L%f^in; z6s2bzT|^`&bKgfrewBWqR9adJ^#D2@NoZxY5Q-r{)X)$jKXShh33>PK-8C9OqMeDx zO9<;LgB3E^;@Bo+D7X_It8U?dQ{kgUY=n3R};c#V%_WISADz!*^dO>mP3fhFVfh{!A`a4WtsFe~Ii2Vj2Yw49Vh|CIorvq5ELUA=NeP)LXe&&gd?xr@oAitE=+|Fhz^@ zBt87u(D0HF94xmk*aj0cse+>7T$$hQYCO!9K`H2G-j&%4;*6OJZ`~$tnADnAG7-H2 zrj(SD0-Bteo<=FBPAr!FOTxF-tE_pGl+@DJ_6Tz96@X`0h&+7>_f2ra@H=SYJpe_J zCGE`6LsnMh4(O_;rbjQtRmv{|)Pt6S{=N39Au=*DJDUmYDo`q&Ola#eFj40RE;?rB zbI1qD43X2;<|Z>6n}tgP3oY$5+#x`PVQp|@h;)udLUJ;j_V=-|qmi#x(A^+J;3wds zUc7h#{qi`O73Mu#hp>I|-_G==6;TTD!2iWSHxPY!vtV7l%7eGHzhUN(2B`p8L>X#1+kd7S zahSI}g?)UEdPI-j6RK%xF|cTBYb(pkn{k8;RZjtD>|)R>7>ont7=?vmU}JXoh%hnA zw{FFOi&ab!9_jA~|5pupo`i%1G92DHsK0vk3NQ|k;yk(ev2nW5LcOuEF+V>KVR43Z z2-wK7FJE-Z9g-m%<{iv=_g_xGZ)&w*`*JV6jEV 
z71&e@INJ-rFTHc)h6!wyXJqg^`S#qg?f{jZ{<1puq=eP5ssuoiZAlDKb3sEC`x*@f zgF%J^Tkz|-Ky8A!KZex?Z9New0&)PvHsmg1LPBHU6W}j&a|7}k1erJAJYrNz0(U!B z>m%S11#}Nhz$YXmBp|Q{Jv`Xkt5fNmxtg_AU|0=WmO&}$!-o${r0-muo!wV|(Ga~E z`mEb%KMPOpfO6|-Z;yPO39{IxjpmuAhDOo@r|H_g2_69fM&$nCVIBA}4Dax{XCPMo zSDxXu<$5^^#qi)k1r+bO=+JKteGX~l;qF>ucsL;`DZk6S!pP{P|Ir$RO>bM8dQ!!D zy1MeSvt{Mw$G(^ZR#yx8$bV=)4VuDW>ikc<`8kr4lPl{J5D1Y8?J=8Zx4Y$G%6F)X*(^ae_8wL#?N>z5Nr;^E_mQHwd4nPtEV_Q_C#--6&bwfaqA z;rc>qfCC_vscEtA!6v9!>xO~+ElI>@cfJ11NL`)I z)*Q+NNY!%Y?auNbOJ$J~&+*|(A(T1<-d4A2Yijm={dIIcOy?Wb`I6tdHN09oJTTCi zEF?frKL^t)C@6?yV`*gt{asyL9BIgU|GsXCbsq*@J@S;>jSG**?8kdp63~7SU_xs_ z06SUX6zCHdQQKPubJ>+B@YTx;;y5sZ_M;^^Ucr|Txhsm;4gZ+V0tRu%P~N#Cdb&RY z5nv_?#qSd~ZlVIV0-FZz2vULZ2JlE22FLmncl2K)wMG9D3@2{{fKS}q+=R5Vqiplf zA8Yqi5^P*txYih`!5;SIm6d?~so)zoZj2iecbk)g`36#gse=}2OL2S}2|iC*xd-al zu+|$aagoDiM@PpT4LvO_cYsuAOxRh`FUtUst*)pzk2uqQKyIj=e6v1!zDqV-#AQy3a+*;+W?8(h<88V)tZVFnl9W&}jZGKq?W*wh;WhT~&j z5b+LI;3G;g|C$L!US3{E!b)8hlz<@q{P}~zaB*-r!fu0rwT7zYCb+bAlfj6468h$;N0h6 z|LI(vg@izKSO4VuOF)uOpFW|eZXmLGW3Gc=X;tdeIIZ-wwY3Gt0@(o>t3&0RH`i#8 zN3-9cm7uOhm`|ds=YFsQCMO!M{!0#i^jsd@(yu=})fi0ic1kLkfUl}jhc(uR)~E>7 z;0-!ya|??T0O8%;-FCb4v`$mA8S|I>ixN4Am>}+hw3?k|sv9e|@znAXVq0bKE-MQQ z9Gobh^cOO34F}=}3kdkjQeRnLUk}_du5r^HGp%jx=XQ2_Qe0dtc6RIn8&Hb3f8Bou z(*ThHEbaW<90*a5qun(gety`Q;H@6|4_u7q?nF3*D}Vkx6A`J6j3lZ=r10C1f?GO+ zHmlNy9VIcJU5Ldk0Ki4Ox1U(~Gt+T#^?~OiB%-EvG&56oOy;v4f^UEd`iqq`E;<^v zd>{pnFEpw(6(}hyi(LT?MSC!*9=N*K0b2l9v9N$#!4=ok)IeJLRol;E*#8>MsyOF^ z2M=JY0aC^VU$)N1_bXuYxuK!3XTE#)xjs`O>Gu;$@?F8i z_j!4F;&;g);rlFEYfLPK9WUa;Uv3V8NwG_ZgW6ui8Qh@m2`s*M;CN%B)OY_(wVcdM z`mh#tPg)Wa>N*3k4^nxtth%(Co|~}g^8;-IF}K^wgNF}`Z0D$6gl}waMhdw(LoI=x zSr)-^AS@kRW0~@C5S|;_Z|8hp zRJ4J-M$B$lHM_Qkt}+}gvmXOx5kx>~1KX1`8$b>aob=Dm&Vo*18IgSU>=|fc*odpE z6HV>$aCQA#WjeOy>Yugl@6CwAP(xU>&Colg2BgaTU literal 0 HcmV?d00001 From 3394ea5814722ceb8a32a8da6a2b37889b04ebf3 Mon Sep 17 00:00:00 2001 From: Bytebinders Date: Mon, 1 Jun 2026 01:41:37 +0100 Subject: [PATCH 3/3] feat: implement OCRService with 
Tesseract integration and regex-based field detection --- app/ai-service/services/ocr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/ai-service/services/ocr.py b/app/ai-service/services/ocr.py index caf7118d..3c32a4f2 100644 --- a/app/ai-service/services/ocr.py +++ b/app/ai-service/services/ocr.py @@ -25,8 +25,8 @@ class OCRResult: class FieldDetector: PATTERNS = { "name": [ - r"(?:Full\s+)?[Nn]ame[:\s]+\n?([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)", - r"(?:Full\s+)?[Nn]ame[:\s]+([A-Z]+(?:\s+[A-Z]+)+)", + r"(?:Full\s+)?[Nn]ame[:\s]+\n?([A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+)", + r"(?:Full\s+)?[Nn]ame[:\s]+\n?([A-Z]+(?:[ \t]+[A-Z]+)+)", ], "date_of_birth": [ r"[Dd]ate\s+(?:of\s+)?[Bb]irth[:\s]*(\d{2}[-./]\d{2}[-./]\d{4})",