Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/ocr-regression.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
name: OCR Regression Test

# Runs the OCR regression harness whenever the OCR pipeline, the harness,
# or this workflow definition changes. Can also be started manually.
on:
  push:
    paths:
      - 'app/ai-service/services/ocr.py'
      - 'app/ai-service/services/preprocessing.py'
      - 'app/ai-service/regression_harness/**'
      # Re-run when the workflow itself is edited so changes to it are exercised
      - '.github/workflows/ocr-regression.yml'
    branches: [ main, develop ]
  pull_request:
    paths:
      - 'app/ai-service/services/ocr.py'
      - 'app/ai-service/services/preprocessing.py'
      - 'app/ai-service/regression_harness/**'
      - '.github/workflows/ocr-regression.yml'
    branches: [ main ]
  workflow_dispatch:

jobs:
  regression:
    runs-on: ubuntu-latest
    # Bound the job so a hung OCR run cannot hold a runner for the default limit
    timeout-minutes: 30

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'

      # The Tesseract engine is a system package, not a pip dependency
      - name: Install System Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y tesseract-ocr libtesseract-dev

      - name: Install Python Dependencies
        working-directory: ./app/ai-service
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install Pillow pytesseract

      # cli.py exits non-zero when any sample fails, which fails this step
      - name: Run OCR Regression Harness
        working-directory: ./app/ai-service
        run: |
          export PYTHONPATH=$PYTHONPATH:.
          python regression_harness/cli.py --output ocr_report.json

      # always(): keep the report available for debugging even when the harness failed
      - name: Upload Regression Report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ocr-regression-report
          path: app/ai-service/ocr_report.json
          retention-days: 14
66 changes: 66 additions & 0 deletions app/ai-service/regression_harness/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# OCR Regression Harness

The OCR Regression Harness is a tool designed to prevent extraction accuracy regressions by running OCR against a "golden dataset" of representative documents and comparing the results to ground truth values.

## Directory Structure

- `regression_harness/`: Main package for the harness.
- `cli.py`: Command line interface.
- `evaluator.py`: Evaluation logic.
- `models.py`: Data models for samples and reports.
- `dataset/`: Contains the golden dataset.
- `documents/`: Folder for raw images (PNG, JPG).
- `ground_truth.json`: The source of truth for expected values.

## How to Run Locally

1. Ensure you are in the `app/ai-service` directory.
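2. Install dependencies. The harness needs the Tesseract OCR engine on the system in addition to the Python packages; CI installs `Pillow` and `pytesseract` explicitly on top of `requirements.txt`:
```bash
sudo apt-get install -y tesseract-ocr libtesseract-dev  # Debian/Ubuntu
pip install -r requirements.txt
pip install Pillow pytesseract
```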
3. Run the harness:
```bash
export PYTHONPATH=.
python regression_harness/cli.py
```
*Note: On Windows, use `set PYTHONPATH=.`*

### CLI Options

- `--dataset`: Path to ground truth JSON (default: `regression_harness/dataset/ground_truth.json`).
- `--output`: Path to save a machine-readable JSON report.
- `--threshold`: Minimum confidence threshold for fields (default: 0.8).

## Adding New Golden Samples

1. **Add the Image**: Place the document image in `regression_harness/dataset/documents/`.
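2. **Update Ground Truth**: Edit `regression_harness/dataset/ground_truth.json` to add a new entry in the `samples` array. `image_path` is resolved relative to the directory that contains `ground_truth.json`; `id`, `image_path` and `expected_fields` are required, while `expected_bboxes` and `metadata` are optional.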

```json
{
"id": "item_001",
"image_path": "documents/item_001.png",
"expected_fields": {
"name": "EXACT EXPECTED NAME",
"id_number": "EXPECTED ID"
},
"metadata": {
"document_type": "passport",
"language": "en"
}
}
```

## Error Classification

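Evaluation findings are categorized into one of these groups:
- **Missing field**: An expected field was not detected by the OCR service. Fails the sample.
- **Incorrect value**: The field was detected but the value didn't match the ground truth (the comparison ignores case and leading/trailing whitespace). Fails the sample.
- **Unexpected field**: OCR extracted a field that wasn't defined in the ground truth. Counted in the error breakdown, but does not fail the sample on its own.
- **Low confidence**: The field matched but the OCR engine's confidence was below the threshold. Counted in the error breakdown, but does not fail the sample on its own.
- **Image not found**: The specified image path in ground truth is invalid. Fails the sample.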

## CI Integration

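The harness runs automatically via `.github/workflows/ocr-regression.yml` on pushes to `main` and `develop` and on pull requests targeting `main` that touch the OCR logic or the regression harness itself; it can also be triggered manually. If any sample fails (i.e. accuracy falls below 100%), the CLI exits with a non-zero status and the CI job fails.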
1 change: 1 addition & 0 deletions app/ai-service/regression_harness/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# OCR Regression Harness package
88 changes: 88 additions & 0 deletions app/ai-service/regression_harness/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import os
import json
import argparse
from typing import List
from regression_harness.models import EvaluationSample, BoundingBox
from regression_harness.evaluator import OCREvaluator

def load_samples(ground_truth_path: str) -> List[EvaluationSample]:
    """Load the golden-dataset samples described by a ground-truth JSON file.

    Args:
        ground_truth_path: Path to a JSON document whose optional top-level
            ``"samples"`` array holds one object per sample.

    Returns:
        One ``EvaluationSample`` per entry, in file order. The list is empty
        when the ``"samples"`` key is absent or the array is empty.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks ``"id"``, ``"image_path"`` or
            ``"expected_fields"``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding, which would corrupt non-ASCII expected values on some systems.
    with open(ground_truth_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    samples = []
    for entry in data.get("samples", []):
        # "expected_bboxes" and "metadata" are optional per sample.
        bboxes = {
            field_name: BoundingBox.from_dict(raw_box)
            for field_name, raw_box in entry.get("expected_bboxes", {}).items()
        }
        samples.append(EvaluationSample(
            id=entry["id"],
            image_path=entry["image_path"],
            expected_fields=entry["expected_fields"],
            expected_bboxes=bboxes,
            metadata=entry.get("metadata", {}),
        ))
    return samples

def print_summary(report):
    """Print a human-readable summary of a regression report to stdout.

    Shows the sample totals, the accuracy percentage and every error category
    with a non-zero count. When at least one sample failed, it also lists each
    failed sample with every field evaluation that is not a match.

    Args:
        report: Object exposing ``total_samples``, ``passed_samples``,
            ``failed_samples``, ``accuracy_percentage``, ``error_counts``
            (mapping of category name to count) and ``sample_results``
            (items with ``passed``, ``sample_id`` and ``field_evaluations``).
    """
    rule = "=" * 50

    print("\n" + rule)
    print(" OCR REGRESSION HARNESS SUMMARY")
    print(rule)
    print(f"Total Samples: {report.total_samples}")
    print(f"Passed: {report.passed_samples}")
    print(f"Failed: {report.failed_samples}")
    print(f"Accuracy: {report.accuracy_percentage:.2f}%")
    print("-" * 50)
    print("Error breakdown:")
    for err, count in report.error_counts.items():
        # Categories that never occurred are omitted to keep the summary short.
        if count > 0:
            print(f" {err:20}: {count}")
    print(rule + "\n")

    if report.failed_samples > 0:
        print("FAILED SAMPLES DETAILS:")
        for res in report.sample_results:
            if res.passed:
                continue
            print(f"\n[!] Sample ID: {res.sample_id}")
            # Named field_eval (not "eval") so the builtin is not shadowed.
            for field_eval in res.field_evaluations:
                if not field_eval.is_match:
                    print(f" - {field_eval.field_name}: Expected '{field_eval.expected_value}', Got '{field_eval.actual_value}' (Error: {field_eval.error_type})")
        print("\n" + rule)

def main():
    """Run the OCR regression harness from the command line.

    Parses ``--dataset``, ``--output`` and ``--threshold``, evaluates every
    sample in the dataset, prints a summary and optionally writes the report
    as JSON.

    A relative ``--dataset`` path is resolved against the directory that
    contains the ``regression_harness`` package, not the current working
    directory. The directory holding the dataset file is passed to
    ``run_suite`` as the base directory for the samples.

    Raises:
        SystemExit: With status 1 when the dataset file does not exist or
            when at least one sample failed, so CI cannot pass without
            actually evaluating the dataset.
    """
    parser = argparse.ArgumentParser(description="OCR Regression Harness")
    parser.add_argument("--dataset", default="regression_harness/dataset/ground_truth.json", help="Path to ground truth JSON")
    parser.add_argument("--output", help="Path to save JSON report")
    parser.add_argument("--threshold", type=float, default=0.8, help="Confidence threshold")

    args = parser.parse_args()

    base_dir = os.path.dirname(os.path.abspath(__file__))
    # This file lives inside regression_harness/; step up one level so that
    # base_dir is app/ai-service, which the default --dataset is relative to.
    if base_dir.endswith("regression_harness"):
        base_dir = os.path.dirname(base_dir)

    gt_path = os.path.join(base_dir, args.dataset)
    if not os.path.exists(gt_path):
        print(f"Error: Dataset not found at {gt_path}")
        # A missing dataset must fail the run; returning normally would exit
        # with status 0 and let CI report success without testing anything.
        raise SystemExit(1)

    samples = load_samples(gt_path)
    evaluator = OCREvaluator(tolerance_threshold=args.threshold)

    print(f"Running evaluation on {len(samples)} samples...")
    report = evaluator.run_suite(samples, os.path.dirname(gt_path))

    print_summary(report)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(report.to_dict(), f, indent=2)
        print(f"Report saved to {args.output}")

    if report.failed_samples > 0:
        # SystemExit rather than the site-provided exit() helper, which is
        # not guaranteed to exist (e.g. under ``python -S``).
        raise SystemExit(1)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions app/ai-service/regression_harness/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Dataset for OCR Regression Harness
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
20 changes: 20 additions & 0 deletions app/ai-service/regression_harness/dataset/ground_truth.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"samples": [
{
"id": "sample_001",
"image_path": "documents/sample_001.png",
"expected_fields": {
"name": "John Doe",
"date_of_birth": "15 Jan 1990",
"id_number": "AB123456"
},
"expected_bboxes": {
"name": {"x": 100, "y": 200, "width": 300, "height": 50}
},
"metadata": {
"document_type": "id_card",
"language": "en"
}
}
]
}
154 changes: 154 additions & 0 deletions app/ai-service/regression_harness/evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import os
import sys
import time
import json
from PIL import Image
from typing import List, Dict, Any, Optional

# Add the parent directory to path so we can import services
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from services.ocr import OCRService
from regression_harness.models import (
EvaluationSample, SampleResult, FieldEvaluation,
RegressionReport, BoundingBox
)

class OCREvaluator:
    """Runs the OCR service over golden samples and scores the extracted fields.

    Each expected field of a sample is compared with the value the OCR
    service extracted; the per-sample results are aggregated into a
    ``RegressionReport``.
    """

    def __init__(self, tolerance_threshold: float = 0.8, iou_threshold: float = 0.5):
        """Create an evaluator backed by a fresh ``OCRService``.

        Args:
            tolerance_threshold: Confidence below which a matching field is
                counted under ``low_confidence`` by ``run_suite``.
            iou_threshold: Stored for bounding-box checks; not used by the
                evaluation logic in this class yet.
        """
        self.ocr_service = OCRService()
        self.tolerance_threshold = tolerance_threshold
        self.iou_threshold = iou_threshold

    def evaluate_sample(self, sample: EvaluationSample, base_dir: str) -> SampleResult:
        """Run OCR on one sample and compare the result with its ground truth.

        Args:
            sample: The golden sample to evaluate.
            base_dir: Directory that ``sample.image_path`` is relative to.

        Returns:
            A ``SampleResult`` with one ``FieldEvaluation`` per expected field
            plus one per unexpected field. The sample passes only when every
            expected field is present and matches; unexpected fields are
            recorded but do not fail the sample. If the image file is missing,
            a single ``image_not_found`` evaluation is returned and the sample
            fails.
        """
        image_path = os.path.join(base_dir, sample.image_path)
        # isfile (not exists): a directory at this path is not a usable image
        # and would otherwise crash in Image.open instead of being reported.
        if not os.path.isfile(image_path):
            return SampleResult(
                sample_id=sample.id,
                field_evaluations=[
                    FieldEvaluation(
                        field_name="all",
                        expected_value=None,
                        actual_value=None,
                        is_match=False,
                        error_type="image_not_found",
                    )
                ],
                passed=False,
                raw_text="",
                processing_time_ms=0,
            )

        # Context manager so the underlying file handle is released after OCR.
        with Image.open(image_path) as image:
            result = self.ocr_service.process_image(image)

        field_evals = []
        all_passed = True

        # Check expected fields
        for field_name, expected_value in sample.expected_fields.items():
            actual_match = result.fields.get(field_name)

            if actual_match is None:
                field_evals.append(FieldEvaluation(
                    field_name=field_name,
                    expected_value=expected_value,
                    actual_value=None,
                    is_match=False,
                    error_type="missing_field",
                ))
                all_passed = False
                continue

            actual_value = actual_match.value
            is_match = self._compare_values(expected_value, actual_value)

            error_type = None
            if not is_match:
                error_type = "incorrect_value"
                all_passed = False

            # Note: Simplified bbox check as current OCRService doesn't return
            # bboxes per field in OCRResult yet. If it did, we would use
            # _calculate_iou here.

            field_evals.append(FieldEvaluation(
                field_name=field_name,
                expected_value=expected_value,
                actual_value=actual_value,
                is_match=is_match,
                error_type=error_type,
                confidence=actual_match.confidence,
            ))

        # Check for unexpected fields (reported, but they do not fail the sample)
        for field_name, extra_match in result.fields.items():
            if field_name not in sample.expected_fields:
                field_evals.append(FieldEvaluation(
                    field_name=field_name,
                    expected_value=None,
                    actual_value=extra_match.value,
                    is_match=False,
                    error_type="unexpected_field",
                ))

        return SampleResult(
            sample_id=sample.id,
            field_evaluations=field_evals,
            passed=all_passed,
            raw_text=result.raw_text,
            processing_time_ms=result.processing_time_ms,
        )

    def _calculate_iou(self, box1: BoundingBox, box2: BoundingBox) -> float:
        """Return the intersection-over-union of two axis-aligned boxes.

        Boxes are read through their ``x``, ``y``, ``width`` and ``height``
        attributes. Returns ``0.0`` when the boxes do not overlap or when the
        union has no area.
        """
        left = max(box1.x, box2.x)
        top = max(box1.y, box2.y)
        right = min(box1.x + box1.width, box2.x + box2.width)
        bottom = min(box1.y + box1.height, box2.y + box2.height)

        # Clamp at zero: disjoint boxes yield a negative extent on some axis.
        intersection = max(0, right - left) * max(0, bottom - top)
        union = box1.width * box1.height + box2.width * box2.height - intersection

        return intersection / union if union > 0 else 0.0

    def _compare_values(self, expected: str, actual: str) -> bool:
        """Return whether two field values match, ignoring case and outer whitespace.

        If either value is empty or ``None`` they match only when they are
        equal as-is. Non-string values (e.g. a numeric ID in the ground-truth
        JSON) are converted with ``str`` before comparison instead of raising.
        """
        if not expected or not actual:
            return expected == actual
        norm_expected = str(expected).strip().lower()
        norm_actual = str(actual).strip().lower()
        return norm_expected == norm_actual

    def run_suite(self, samples: List[EvaluationSample], base_dir: str) -> RegressionReport:
        """Evaluate every sample and aggregate the outcome into a report.

        Args:
            samples: Samples to evaluate, in order.
            base_dir: Directory that each sample's ``image_path`` is relative to.

        Returns:
            A ``RegressionReport`` with pass/fail totals, the percentage of
            passed samples (``0`` for an empty suite), per-category error
            counts and the individual sample results.
        """
        results = []
        error_counts = {
            "missing_field": 0,
            "incorrect_value": 0,
            "unexpected_field": 0,
            "image_not_found": 0,
            "low_confidence": 0,
            "bbox_mismatch": 0,
        }

        for sample in samples:
            res = self.evaluate_sample(sample, base_dir)
            results.append(res)

            for eval_item in res.field_evaluations:
                if eval_item.error_type in error_counts:
                    error_counts[eval_item.error_type] += 1

                # A match may carry no confidence at all; comparing None with
                # a float would raise TypeError and abort the whole suite.
                if (
                    eval_item.is_match
                    and eval_item.confidence is not None
                    and eval_item.confidence < self.tolerance_threshold
                ):
                    error_counts["low_confidence"] += 1

        passed_count = sum(1 for r in results if r.passed)
        total_count = len(samples)
        accuracy = (passed_count / total_count * 100) if total_count > 0 else 0

        return RegressionReport(
            total_samples=total_count,
            passed_samples=passed_count,
            failed_samples=total_count - passed_count,
            accuracy_percentage=accuracy,
            error_counts=error_counts,
            sample_results=results,
        )
Loading
Loading