Pulsefy · bytebinders · May 29, 2026 · May 30, 2026 · Jun 1, 2026
diff --git a/.github/workflows/ocr-regression.yml b/.github/workflows/ocr-regression.yml
@@ -0,0 +1,56 @@
+# CI workflow that runs the OCR regression harness (regression_harness/cli.py).
+name: OCR Regression Test
+
+# Only run when the OCR/preprocessing services or the harness itself change.
+on:
+  push:
+    paths:
+      - 'app/ai-service/services/ocr.py'
+      - 'app/ai-service/services/preprocessing.py'
+      - 'app/ai-service/regression_harness/**'
+    branches: [ main, develop ]
+  pull_request:
+    paths:
+      - 'app/ai-service/services/ocr.py'
+      - 'app/ai-service/services/preprocessing.py'
+      - 'app/ai-service/regression_harness/**'
+    branches: [ main ]
+  # Allows the workflow to be started manually.
+  workflow_dispatch:
+
+jobs:
+  regression:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.11'
+        cache: 'pip'
+
+    # Tesseract engine and development headers; pytesseract is installed in the next step.
+    - name: Install System Dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y tesseract-ocr libtesseract-dev
+
+    # Pillow and pytesseract are installed explicitly in addition to requirements.txt.
+    - name: Install Python Dependencies
+      working-directory: ./app/ai-service
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install Pillow pytesseract
+
+    # PYTHONPATH includes the working directory so `regression_harness` and `services`
+    # are importable; cli.py exits with status 1 when any sample fails, failing this job.
+    - name: Run OCR Regression Harness
+      working-directory: ./app/ai-service
+      run: |
+        export PYTHONPATH=$PYTHONPATH:.
+        python regression_harness/cli.py --output ocr_report.json
+
+    # `if: always()` uploads the report even when the harness step failed.
+    - name: Upload Regression Report
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: ocr-regression-report
+        path: app/ai-service/ocr_report.json
+        retention-days: 14
diff --git a/app/ai-service/regression_harness/README.md b/app/ai-service/regression_harness/README.md
@@ -0,0 +1,66 @@
+# OCR Regression Harness
+
+The OCR Regression Harness is a tool designed to prevent extraction accuracy regressions by running OCR against a "golden dataset" of representative documents and comparing the results to ground truth values.
+
+## Directory Structure
+
+- `regression_harness/`: Main package for the harness.
+  - `cli.py`: Command line interface.
+  - `evaluator.py`: Evaluation logic.
+  - `models.py`: Data models for samples and reports.
+  - `dataset/`: Contains the golden dataset.
+    - `documents/`: Folder for raw images (PNG, JPG).
+    - `ground_truth.json`: The source of truth for expected values.
+
+## How to Run Locally
+
+1. Ensure you are in the `app/ai-service` directory.
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Run the harness:
+   ```bash
+   export PYTHONPATH=.
+   python regression_harness/cli.py
+   ```
+   *Note: On Windows, use `set PYTHONPATH=.`*
+
+### CLI Options
+
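+- `--dataset`: Path to ground truth JSON (default: `regression_harness/dataset/ground_truth.json`). Relative paths are resolved against the `app/ai-service` directory (the parent of the harness package), not the current working directory.
+- `--output`: Path to save a machine-readable JSON report.
+- `--threshold`: Minimum confidence threshold for fields (default: 0.8). Matched fields whose confidence is below this value are counted as "low confidence" in the error breakdown.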
+
+## Adding New Golden Samples
+
+1. **Add the Image**: Place the document image in `regression_harness/dataset/documents/`.
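+2. **Update Ground Truth**: Edit `regression_harness/dataset/ground_truth.json` to add a new entry in the `samples` array. The `image_path` is resolved relative to the directory containing the ground truth JSON file; `id`, `image_path` and `expected_fields` are required, while `expected_bboxes` and `metadata` are optional.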
+
+```json
+{
+  "id": "item_001",
+  "image_path": "documents/item_001.png",
+  "expected_fields": {
+    "name": "EXACT EXPECTED NAME",
+    "id_number": "EXPECTED ID"
+  },
+  "metadata": {
+    "document_type": "passport",
+    "language": "en"
+  }
+}
+```
+
+## Error Classification
+
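+Each reported problem is classified into one of these categories:
+- **Missing field**: An expected field was not detected by the OCR service. This fails the sample.
+- **Incorrect value**: The field was detected but the value didn't match the ground truth (the comparison ignores case and leading/trailing whitespace). This fails the sample.
+- **Unexpected field**: OCR extracted a field that wasn't defined in the ground truth. This is reported but does not currently fail the sample.
+- **Low confidence**: The field matched but the OCR engine's confidence was below the threshold. This is counted in the error breakdown but does not currently fail the sample.
+- **Image not found**: The image path specified in the ground truth does not exist. This fails the sample.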
+
+## CI Integration
+
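+The harness runs automatically via `.github/workflows/ocr-regression.yml` on pushes to `main` or `develop` and on pull requests targeting `main` that touch `services/ocr.py`, `services/preprocessing.py`, or the regression harness itself; it can also be triggered manually. If any sample fails (i.e. the accuracy falls below 100%), the CI job will fail. The JSON report is uploaded as the `ocr-regression-report` artifact.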
diff --git a/app/ai-service/regression_harness/__init__.py b/app/ai-service/regression_harness/__init__.py
@@ -0,0 +1 @@
+# OCR Regression Harness package
diff --git a/app/ai-service/regression_harness/cli.py b/app/ai-service/regression_harness/cli.py
@@ -0,0 +1,88 @@
+import os
+import json
+import argparse
+from typing import List
+from regression_harness.models import EvaluationSample, BoundingBox
+from regression_harness.evaluator import OCREvaluator
+
+def load_samples(ground_truth_path: str) -> List[EvaluationSample]:
+    """Load the golden-dataset samples described by a ground-truth JSON file.
+
+    Args:
+        ground_truth_path: Path to a JSON file with a top-level ``samples`` list.
+
+    Returns:
+        One ``EvaluationSample`` per entry, in file order. A file without a
+        ``samples`` key yields an empty list.
+
+    Raises:
+        KeyError: If an entry lacks ``id``, ``image_path`` or ``expected_fields``.
+    """
+    # Be explicit about UTF-8 so non-ASCII expected values load the same way
+    # regardless of the platform's default text encoding.
+    with open(ground_truth_path, encoding="utf-8") as f:
+        data = json.load(f)
+
+    return [
+        EvaluationSample(
+            id=entry["id"],
+            image_path=entry["image_path"],
+            expected_fields=entry["expected_fields"],
+            # Bounding boxes and metadata are optional per sample.
+            expected_bboxes={
+                field_name: BoundingBox.from_dict(raw_box)
+                for field_name, raw_box in entry.get("expected_bboxes", {}).items()
+            },
+            metadata=entry.get("metadata", {}),
+        )
+        for entry in data.get("samples", [])
+    ]
+
+def print_summary(report):
+    """Print a human-readable summary of a regression report to stdout.
+
+    Args:
+        report: Object exposing ``total_samples``, ``passed_samples``,
+            ``failed_samples``, ``accuracy_percentage``, ``error_counts``
+            (mapping of error name to count) and ``sample_results``.
+
+    Only error categories with a non-zero count are listed. When at least one
+    sample failed, every non-matching field of each failed sample is printed.
+    """
+    rule = "=" * 50
+
+    print("\n" + rule)
+    print(" OCR REGRESSION HARNESS SUMMARY")
+    print(rule)
+    print(f"Total Samples:    {report.total_samples}")
+    print(f"Passed:           {report.passed_samples}")
+    print(f"Failed:           {report.failed_samples}")
+    print(f"Accuracy:         {report.accuracy_percentage:.2f}%")
+    print("-" * 50)
+    print("Error breakdown:")
+    for err, count in report.error_counts.items():
+        if count > 0:
+            print(f"  {err:20}: {count}")
+    print(rule + "\n")
+
+    if report.failed_samples <= 0:
+        return
+
+    print("FAILED SAMPLES DETAILS:")
+    for res in report.sample_results:
+        if res.passed:
+            continue
+        print(f"\n[!] Sample ID: {res.sample_id}")
+        # `field_eval` rather than `eval` so the builtin is not shadowed.
+        for field_eval in res.field_evaluations:
+            if not field_eval.is_match:
+                print(f"    - {field_eval.field_name}: Expected '{field_eval.expected_value}', Got '{field_eval.actual_value}' (Error: {field_eval.error_type})")
+    print("\n" + rule)
+
+def main():
+    """Run the OCR regression suite from the command line.
+
+    Parses ``--dataset``, ``--output`` and ``--threshold``, evaluates every
+    sample in the dataset, prints a summary and optionally writes the report
+    as JSON.
+
+    Raises:
+        SystemExit: With status 1 when the dataset file is missing or when at
+            least one sample fails, so that CI marks the run as failed.
+    """
+    parser = argparse.ArgumentParser(description="OCR Regression Harness")
+    parser.add_argument("--dataset", default="regression_harness/dataset/ground_truth.json", help="Path to ground truth JSON")
+    parser.add_argument("--output", help="Path to save JSON report")
+    parser.add_argument("--threshold", type=float, default=0.8, help="Confidence threshold")
+
+    args = parser.parse_args()
+
+    # Relative --dataset paths are resolved against the directory containing
+    # the regression_harness package (app/ai-service), not the working directory.
+    base_dir = os.path.dirname(os.path.abspath(__file__))
+    if base_dir.endswith("regression_harness"):
+        base_dir = os.path.dirname(base_dir)
+
+    gt_path = os.path.join(base_dir, args.dataset)
+    if not os.path.exists(gt_path):
+        # A missing dataset must not look like a passing run: exit non-zero.
+        print(f"Error: Dataset not found at {gt_path}")
+        raise SystemExit(1)
+
+    samples = load_samples(gt_path)
+    evaluator = OCREvaluator(tolerance_threshold=args.threshold)
+
+    print(f"Running evaluation on {len(samples)} samples...")
+    # Image paths in the dataset are relative to the ground-truth file's directory.
+    report = evaluator.run_suite(samples, os.path.dirname(gt_path))
+
+    print_summary(report)
+
+    if args.output:
+        with open(args.output, 'w', encoding="utf-8") as f:
+            json.dump(report.to_dict(), f, indent=2)
+        print(f"Report saved to {args.output}")
+
+    if report.failed_samples > 0:
+        # SystemExit instead of the site-provided exit() helper, which is not
+        # guaranteed to exist (e.g. under `python -S`).
+        raise SystemExit(1)
+
+if __name__ == "__main__":
+    main()
diff --git a/app/ai-service/regression_harness/dataset/__init__.py b/app/ai-service/regression_harness/dataset/__init__.py
@@ -0,0 +1 @@
+# Dataset for OCR Regression Harness
diff --git a/app/ai-service/regression_harness/dataset/documents/sample_001.png b/app/ai-service/regression_harness/dataset/documents/sample_001.png
diff --git a/app/ai-service/regression_harness/dataset/ground_truth.json b/app/ai-service/regression_harness/dataset/ground_truth.json
@@ -0,0 +1,20 @@
+{
+  "samples": [
+    {
+      "id": "sample_001",
+      "image_path": "documents/sample_001.png",
+      "expected_fields": {
+        "name": "John Doe",
+        "date_of_birth": "15 Jan 1990",
+        "id_number": "AB123456"
+      },
+      "expected_bboxes": {
+        "name": {"x": 100, "y": 200, "width": 300, "height": 50}
+      },
+      "metadata": {
+        "document_type": "id_card",
+        "language": "en"
+      }
+    }
+  ]
+}
diff --git a/app/ai-service/regression_harness/evaluator.py b/app/ai-service/regression_harness/evaluator.py
@@ -0,0 +1,154 @@
+import os
+import sys
+import time
+import json
+from PIL import Image
+from typing import List, Dict, Any, Optional
+
+# Add the parent directory to path so we can import services
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from services.ocr import OCRService
+from regression_harness.models import (
+    EvaluationSample, SampleResult, FieldEvaluation, 
+    RegressionReport, BoundingBox
+)
+
+class OCREvaluator:
+    """Runs the OCR service over golden samples and scores the extracted fields.
+
+    A sample passes when every expected field is present and its value matches
+    the ground truth (case-insensitively, ignoring surrounding whitespace).
+    Unexpected fields and low-confidence matches are recorded but do not
+    change a sample's pass/fail status.
+    """
+
+    def __init__(self, tolerance_threshold: float = 0.8, iou_threshold: float = 0.5):
+        """Create an evaluator.
+
+        Args:
+            tolerance_threshold: Confidence below which a matching field is
+                counted as ``low_confidence`` by ``run_suite``.
+            iou_threshold: Stored for bounding-box comparison; no bounding-box
+                check is performed yet.
+        """
+        self.ocr_service = OCRService()
+        self.tolerance_threshold = tolerance_threshold
+        self.iou_threshold = iou_threshold
+
+    def evaluate_sample(self, sample: EvaluationSample, base_dir: str) -> SampleResult:
+        """Run OCR on one sample and compare the result with its ground truth.
+
+        Args:
+            sample: The golden sample to evaluate.
+            base_dir: Directory against which ``sample.image_path`` is resolved.
+
+        Returns:
+            A ``SampleResult`` with one ``FieldEvaluation`` per expected field
+            plus one per unexpected field. If the image file does not exist, a
+            failed result with a single ``image_not_found`` evaluation is
+            returned instead.
+        """
+        image_path = os.path.join(base_dir, sample.image_path)
+        if not os.path.exists(image_path):
+            return SampleResult(
+                sample_id=sample.id,
+                field_evaluations=[
+                    FieldEvaluation(
+                        field_name="all",
+                        expected_value=None,
+                        actual_value=None,
+                        is_match=False,
+                        error_type="image_not_found"
+                    )
+                ],
+                passed=False,
+                raw_text="",
+                processing_time_ms=0
+            )
+
+        # Use a context manager so the image's file handle is released after
+        # each sample instead of being left open for the garbage collector.
+        with Image.open(image_path) as image:
+            result = self.ocr_service.process_image(image)
+
+        field_evals = []
+        all_passed = True
+
+        # Check expected fields
+        for field_name, expected_value in sample.expected_fields.items():
+            actual_match = result.fields.get(field_name)
+
+            if not actual_match:
+                field_evals.append(FieldEvaluation(
+                    field_name=field_name,
+                    expected_value=expected_value,
+                    actual_value=None,
+                    is_match=False,
+                    error_type="missing_field"
+                ))
+                all_passed = False
+                continue
+
+            actual_value = actual_match.value
+            is_match = self._compare_values(expected_value, actual_value)
+            if not is_match:
+                all_passed = False
+
+            # Note: Simplified bbox check as current OCRService doesn't return bboxes per field in OCRResult yet.
+            # If it did, we would use _calculate_iou here.
+
+            field_evals.append(FieldEvaluation(
+                field_name=field_name,
+                expected_value=expected_value,
+                actual_value=actual_value,
+                is_match=is_match,
+                error_type=None if is_match else "incorrect_value",
+                confidence=actual_match.confidence
+            ))
+
+        # Check for unexpected fields (reported, but they do not fail the sample)
+        for field_name, extracted in result.fields.items():
+            if field_name not in sample.expected_fields:
+                field_evals.append(FieldEvaluation(
+                    field_name=field_name,
+                    expected_value=None,
+                    actual_value=extracted.value,
+                    is_match=False,
+                    error_type="unexpected_field"
+                ))
+
+        return SampleResult(
+            sample_id=sample.id,
+            field_evaluations=field_evals,
+            passed=all_passed,
+            raw_text=result.raw_text,
+            processing_time_ms=result.processing_time_ms
+        )
+
+    def _calculate_iou(self, box1: BoundingBox, box2: BoundingBox) -> float:
+        """Return the intersection-over-union of two boxes (0 if the union is empty)."""
+        x1 = max(box1.x, box2.x)
+        y1 = max(box1.y, box2.y)
+        x2 = min(box1.x + box1.width, box2.x + box2.width)
+        y2 = min(box1.y + box1.height, box2.y + box2.height)
+
+        # Clamp at zero so disjoint boxes yield no intersection area.
+        intersection = max(0, x2 - x1) * max(0, y2 - y1)
+        area1 = box1.width * box1.height
+        area2 = box2.width * box2.height
+        union = area1 + area2 - intersection
+
+        return intersection / union if union > 0 else 0
+
+    def _compare_values(self, expected: str, actual: str) -> bool:
+        """Compare two values case-insensitively, ignoring surrounding whitespace.
+
+        If either value is empty or ``None`` they are compared as-is, so an
+        empty value only matches an identical empty value.
+        """
+        if not expected or not actual:
+            return expected == actual
+        norm_expected = expected.strip().lower()
+        norm_actual = actual.strip().lower()
+        return norm_expected == norm_actual
+
+    def run_suite(self, samples: List[EvaluationSample], base_dir: str) -> RegressionReport:
+        """Evaluate every sample and aggregate the results into a report.
+
+        Args:
+            samples: Samples to evaluate.
+            base_dir: Directory against which each sample's image path is resolved.
+
+        Returns:
+            A ``RegressionReport`` with pass/fail counts, the percentage of
+            passing samples (0 for an empty suite) and per-category error counts.
+        """
+        results = []
+        error_counts = {
+            "missing_field": 0,
+            "incorrect_value": 0,
+            "unexpected_field": 0,
+            "image_not_found": 0,
+            "low_confidence": 0,
+            "bbox_mismatch": 0
+        }
+
+        for sample in samples:
+            res = self.evaluate_sample(sample, base_dir)
+            results.append(res)
+
+            for eval_item in res.field_evaluations:
+                if eval_item.error_type in error_counts:
+                    error_counts[eval_item.error_type] += 1
+
+                # A matched field may carry no confidence value; skip those
+                # rather than comparing None with a float (TypeError).
+                confidence = eval_item.confidence
+                if (
+                    eval_item.is_match
+                    and confidence is not None
+                    and confidence < self.tolerance_threshold
+                ):
+                    error_counts["low_confidence"] += 1
+
+        passed_count = sum(1 for r in results if r.passed)
+        total_count = len(samples)
+        accuracy = (passed_count / total_count * 100) if total_count > 0 else 0
+
+        return RegressionReport(
+            total_samples=total_count,
+            passed_samples=passed_count,
+            failed_samples=total_count - passed_count,
+            accuracy_percentage=accuracy,
+            error_counts=error_counts,
+            sample_results=results
+        )