diff --git a/.github/workflows/ocr-regression.yml b/.github/workflows/ocr-regression.yml new file mode 100644 index 00000000..95d690f3 --- /dev/null +++ b/.github/workflows/ocr-regression.yml @@ -0,0 +1,56 @@ +name: OCR Regression Test + +on: + push: + paths: + - 'app/ai-service/services/ocr.py' + - 'app/ai-service/services/preprocessing.py' + - 'app/ai-service/regression_harness/**' + branches: [ main, develop ] + pull_request: + paths: + - 'app/ai-service/services/ocr.py' + - 'app/ai-service/services/preprocessing.py' + - 'app/ai-service/regression_harness/**' + branches: [ main ] + workflow_dispatch: + +jobs: + regression: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install System Dependencies + run: | + sudo apt-get update + sudo apt-get install -y tesseract-ocr libtesseract-dev + + - name: Install Python Dependencies + working-directory: ./app/ai-service + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install Pillow pytesseract + + - name: Run OCR Regression Harness + working-directory: ./app/ai-service + run: | + export PYTHONPATH=$PYTHONPATH:. + python regression_harness/cli.py --output ocr_report.json + + - name: Upload Regression Report + if: always() + uses: actions/upload-artifact@v4 + with: + name: ocr-regression-report + path: app/ai-service/ocr_report.json + retention-days: 14 diff --git a/app/ai-service/regression_harness/README.md b/app/ai-service/regression_harness/README.md new file mode 100644 index 00000000..9aeb91de --- /dev/null +++ b/app/ai-service/regression_harness/README.md @@ -0,0 +1,66 @@ +# OCR Regression Harness + +The OCR Regression Harness is a tool designed to prevent extraction accuracy regressions by running OCR against a "golden dataset" of representative documents and comparing the results to ground truth values. 
+ +## Directory Structure + +- `regression_harness/`: Main package for the harness. + - `cli.py`: Command line interface. + - `evaluator.py`: Evaluation logic. + - `models.py`: Data models for samples and reports. + - `dataset/`: Contains the golden dataset. + - `documents/`: Folder for raw images (PNG, JPG). + - `ground_truth.json`: The source of truth for expected values. + +## How to Run Locally + +1. Ensure you are in the `app/ai-service` directory. +2. Install dependencies: + ```bash + pip install -r requirements.txt + ``` +3. Run the harness: + ```bash + export PYTHONPATH=. + python regression_harness/cli.py + ``` + *Note: On Windows, use `set PYTHONPATH=.`* + +### CLI Options + +- `--dataset`: Path to ground truth JSON (default: `regression_harness/dataset/ground_truth.json`). +- `--output`: Path to save a machine-readable JSON report. +- `--threshold`: Minimum confidence threshold for fields (default: 0.8). + +## Adding New Golden Samples + +1. **Add the Image**: Place the document image in `regression_harness/dataset/documents/`. +2. **Update Ground Truth**: Edit `regression_harness/dataset/ground_truth.json` to add a new entry in the `samples` array. + +```json +{ + "id": "item_001", + "image_path": "documents/item_001.png", + "expected_fields": { + "name": "EXACT EXPECTED NAME", + "id_number": "EXPECTED ID" + }, + "metadata": { + "document_type": "passport", + "language": "en" + } +} +``` + +## Error Classification + +Failures are categorized into one of these groups: +- **Missing field**: A required field was not detected by the OCR service. +- **Incorrect value**: The field was detected, but its value did not match the ground truth. +- **Unexpected field**: The OCR service extracted a field that is not defined in the ground truth. +- **Low confidence**: The field matched, but the OCR engine's confidence was below the threshold. +- **Image not found**: The image path specified in the ground truth does not point to an existing file. 
"""Command line entry point for the OCR regression harness."""
import argparse
import json
import os
from typing import TYPE_CHECKING, List, Optional, Sequence

if TYPE_CHECKING:
    # Needed for annotations only; the runtime import happens inside
    # load_samples() so this module stays importable on its own.
    from regression_harness.models import EvaluationSample


def load_samples(ground_truth_path: str) -> "List[EvaluationSample]":
    """Parse the ground-truth JSON file into ``EvaluationSample`` objects.

    Args:
        ground_truth_path: Path to a JSON document with a top-level
            ``"samples"`` array. Each entry must provide ``id``,
            ``image_path`` and ``expected_fields``; ``expected_bboxes`` and
            ``metadata`` are optional and default to empty dicts.

    Returns:
        One ``EvaluationSample`` per entry, in file order. A missing
        ``"samples"`` key yields an empty list.

    Raises:
        KeyError: If an entry lacks one of the required keys.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Deferred so that argument parsing and dataset-path validation in main()
    # do not depend on the harness package being importable.
    from regression_harness.models import BoundingBox, EvaluationSample

    with open(ground_truth_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    samples = []
    for entry in data.get("samples", []):
        bboxes = {
            name: BoundingBox.from_dict(box)
            for name, box in entry.get("expected_bboxes", {}).items()
        }
        samples.append(EvaluationSample(
            id=entry["id"],
            image_path=entry["image_path"],
            expected_fields=entry["expected_fields"],
            expected_bboxes=bboxes,
            metadata=entry.get("metadata", {}),
        ))
    return samples


def print_summary(report) -> None:
    """Print a human-readable summary of a regression report to stdout.

    Args:
        report: Object exposing ``total_samples``, ``passed_samples``,
            ``failed_samples``, ``accuracy_percentage``, ``error_counts``
            (mapping of error name to count) and ``sample_results``.

    Error categories with a zero count are omitted. When at least one sample
    failed, every non-matching field of every failed sample is listed.
    """
    print("\n" + "=" * 50)
    print(" OCR REGRESSION HARNESS SUMMARY")
    print("=" * 50)
    print(f"Total Samples: {report.total_samples}")
    print(f"Passed: {report.passed_samples}")
    print(f"Failed: {report.failed_samples}")
    print(f"Accuracy: {report.accuracy_percentage:.2f}%")
    print("-" * 50)
    print("Error breakdown:")
    for err, count in report.error_counts.items():
        if count > 0:
            print(f" {err:20}: {count}")
    print("=" * 50 + "\n")

    if report.failed_samples > 0:
        print("FAILED SAMPLES DETAILS:")
        for res in report.sample_results:
            if res.passed:
                continue
            print(f"\n[!] Sample ID: {res.sample_id}")
            # Named field_eval (not eval) to avoid shadowing the builtin.
            for field_eval in res.field_evaluations:
                if not field_eval.is_match:
                    print(f" - {field_eval.field_name}: Expected '{field_eval.expected_value}', Got '{field_eval.actual_value}' (Error: {field_eval.error_type})")
        print("\n" + "=" * 50)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run the regression suite and report the outcome via the exit status.

    Args:
        argv: Command line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With code 2 when the harness cannot run (dataset file
            missing, or it defines no samples), and with code 1 when at
            least one sample failed. Returns normally when every sample
            passed.
    """
    parser = argparse.ArgumentParser(description="OCR Regression Harness")
    parser.add_argument("--dataset", default="regression_harness/dataset/ground_truth.json", help="Path to ground truth JSON")
    parser.add_argument("--output", help="Path to save JSON report")
    parser.add_argument("--threshold", type=float, default=0.8, help="Confidence threshold")

    args = parser.parse_args(argv)

    base_dir = os.path.dirname(os.path.abspath(__file__))
    # Adjust base_dir if it's currently inside regression_harness
    if base_dir.endswith("regression_harness"):
        base_dir = os.path.dirname(base_dir)
    # We want base_dir to be app/ai-service

    gt_path = os.path.join(base_dir, args.dataset)
    if not os.path.isfile(gt_path):
        # A bare return here would exit with status 0 and let CI pass without
        # evaluating anything, so signal the failure explicitly.
        print(f"Error: Dataset not found at {gt_path}")
        raise SystemExit(2)

    samples = load_samples(gt_path)
    if not samples:
        # Zero samples means zero failures; refuse to report that as success.
        print(f"Error: No samples defined in {gt_path}")
        raise SystemExit(2)

    # Deferred so the checks above work without the OCR dependencies.
    from regression_harness.evaluator import OCREvaluator

    evaluator = OCREvaluator(tolerance_threshold=args.threshold)

    print(f"Running evaluation on {len(samples)} samples...")
    # Image paths in the dataset are resolved relative to the dataset file.
    report = evaluator.run_suite(samples, os.path.dirname(gt_path))

    print_summary(report)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(report.to_dict(), f, indent=2)
        print(f"Report saved to {args.output}")

    if report.failed_samples > 0:
        # SystemExit is what sys.exit raises; unlike the bare exit() helper
        # it does not depend on the site module being loaded.
        raise SystemExit(1)


if __name__ == "__main__":
    main()
"""OCR regression harness: report data models and the evaluator that fills them.

Source paths: regression_harness/models.py (dataclasses) and
regression_harness/evaluator.py (OCREvaluator).
"""
import json
import os
import sys
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

# Add the parent directory to path so we can import services
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


@dataclass
class BoundingBox:
    """Axis-aligned rectangle; ``width``/``height`` extend from ``(x, y)``."""

    x: int
    y: int
    width: int
    height: int

    def to_dict(self) -> Dict[str, int]:
        """Return the box as a plain ``x``/``y``/``width``/``height`` dict."""
        return {"x": self.x, "y": self.y, "width": self.width, "height": self.height}

    @classmethod
    def from_dict(cls, data: dict) -> "BoundingBox":
        """Build a box from a dict holding exactly the four field names.

        Raises:
            TypeError: If a key is missing or an unknown key is present.
        """
        return cls(**data)


@dataclass
class EvaluationSample:
    """One golden-dataset entry: an image plus the values expected from it."""

    id: str
    # Resolved against the base directory handed to the evaluator.
    image_path: str
    expected_fields: Dict[str, str]
    expected_bboxes: Dict[str, BoundingBox] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class FieldEvaluation:
    """Outcome of comparing one field against the ground truth."""

    field_name: str
    expected_value: Optional[str]
    actual_value: Optional[str]
    is_match: bool
    # One of 'missing_field', 'incorrect_value', 'unexpected_field',
    # 'image_not_found', 'low_confidence', or None when nothing was flagged.
    error_type: Optional[str] = None
    confidence: float = 0.0


@dataclass
class SampleResult:
    """All field evaluations for one sample plus the raw OCR output."""

    sample_id: str
    field_evaluations: List[FieldEvaluation]
    passed: bool
    raw_text: str
    processing_time_ms: int


@dataclass
class RegressionReport:
    """Aggregate outcome of a full suite run."""

    total_samples: int
    passed_samples: int
    failed_samples: int
    accuracy_percentage: float
    error_counts: Dict[str, int]
    sample_results: List[SampleResult]

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serialisable view with ``summary`` and ``details``.

        ``raw_text`` is not included in the output.
        """
        return {
            "summary": {
                "total": self.total_samples,
                "passed": self.passed_samples,
                "failed": self.failed_samples,
                "accuracy": self.accuracy_percentage,
                "error_breakdown": self.error_counts,
            },
            "details": [
                {
                    "sample_id": r.sample_id,
                    "passed": r.passed,
                    # Collected per sample; exposed so timing is visible in
                    # the saved report.
                    "processing_time_ms": r.processing_time_ms,
                    "fields": [
                        {
                            "name": f.field_name,
                            "expected": f.expected_value,
                            "actual": f.actual_value,
                            "match": f.is_match,
                            "error": f.error_type,
                            "confidence": f.confidence,
                        }
                        for f in r.field_evaluations
                    ],
                }
                for r in self.sample_results
            ],
        }


class OCREvaluator:
    """Runs the OCR service over golden samples and scores the extraction."""

    def __init__(
        self,
        tolerance_threshold: float = 0.8,
        iou_threshold: float = 0.5,
        ocr_service: Optional[Any] = None,
    ):
        """Create an evaluator.

        Args:
            tolerance_threshold: Confidence below which a matching field is
                tagged ``low_confidence``.
            iou_threshold: Stored for bounding-box checks; not used by the
                current evaluation (see the note in ``_evaluate_fields``).
            ocr_service: Object exposing ``process_image(image)``. When
                omitted, a default ``OCRService`` is constructed.
        """
        if ocr_service is None:
            # Imported on demand so an injected service does not require the
            # default OCR backend to be importable.
            from services.ocr import OCRService

            ocr_service = OCRService()
        self.ocr_service = ocr_service
        self.tolerance_threshold = tolerance_threshold
        self.iou_threshold = iou_threshold

    def evaluate_sample(self, sample: EvaluationSample, base_dir: str) -> SampleResult:
        """Run OCR on one sample and compare every expected field.

        Args:
            sample: The golden sample to evaluate.
            base_dir: Directory that ``sample.image_path`` is relative to.

        Returns:
            A ``SampleResult``. A missing image yields a single
            ``image_not_found`` evaluation and ``passed=False``.
        """
        image_path = os.path.join(base_dir, sample.image_path)
        if not os.path.exists(image_path):
            return SampleResult(
                sample_id=sample.id,
                field_evaluations=[
                    FieldEvaluation(
                        field_name="all",
                        expected_value=None,
                        actual_value=None,
                        is_match=False,
                        error_type="image_not_found",
                    )
                ],
                passed=False,
                raw_text="",
                processing_time_ms=0,
            )

        # Pillow is only needed once an image actually has to be read.
        from PIL import Image

        # Image.open keeps the underlying file open; the context manager
        # releases it once OCR has finished with the image.
        with Image.open(image_path) as image:
            result = self.ocr_service.process_image(image)

        field_evals, all_passed = self._evaluate_fields(sample, result)
        return SampleResult(
            sample_id=sample.id,
            field_evaluations=field_evals,
            passed=all_passed,
            raw_text=result.raw_text,
            processing_time_ms=result.processing_time_ms,
        )

    def _evaluate_fields(self, sample: EvaluationSample, result: Any):
        """Compare ``result.fields`` with the sample; return (evaluations, passed)."""
        field_evals = []
        all_passed = True

        # Check expected fields
        for field_name, expected_value in sample.expected_fields.items():
            actual_match = result.fields.get(field_name)

            if not actual_match:
                field_evals.append(FieldEvaluation(
                    field_name=field_name,
                    expected_value=expected_value,
                    actual_value=None,
                    is_match=False,
                    error_type="missing_field",
                ))
                all_passed = False
                continue

            actual_value = actual_match.value
            is_match = self._compare_values(expected_value, actual_value)

            error_type = None
            if not is_match:
                error_type = "incorrect_value"
                all_passed = False
            elif actual_match.confidence < self.tolerance_threshold:
                # Tagged so the per-field report shows it; a correct value
                # with low confidence does not fail the sample.
                error_type = "low_confidence"

            # Note: Simplified bbox check as current OCRService doesn't return
            # bboxes per field in OCRResult yet. If it did, we would use
            # _calculate_iou here.

            field_evals.append(FieldEvaluation(
                field_name=field_name,
                expected_value=expected_value,
                actual_value=actual_value,
                is_match=is_match,
                error_type=error_type,
                confidence=actual_match.confidence,
            ))

        # Check for unexpected fields (reported, but they do not fail the sample)
        for field_name, extra in result.fields.items():
            if field_name not in sample.expected_fields:
                field_evals.append(FieldEvaluation(
                    field_name=field_name,
                    expected_value=None,
                    actual_value=extra.value,
                    is_match=False,
                    error_type="unexpected_field",
                ))

        return field_evals, all_passed

    def _calculate_iou(self, box1: BoundingBox, box2: BoundingBox) -> float:
        """Return intersection-over-union of two boxes; 0.0 when the union is empty."""
        x1 = max(box1.x, box2.x)
        y1 = max(box1.y, box2.y)
        x2 = min(box1.x + box1.width, box2.x + box2.width)
        y2 = min(box1.y + box1.height, box2.y + box2.height)

        # Clamp at zero so disjoint boxes give an empty intersection.
        intersection = max(0, x2 - x1) * max(0, y2 - y1)
        area1 = box1.width * box1.height
        area2 = box2.width * box2.height
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0.0

    def _compare_values(self, expected: Any, actual: Any) -> bool:
        """Compare two values ignoring case and surrounding whitespace.

        ``None`` and the empty string only match themselves. Other values
        are converted with ``str()`` first, so non-string ground truth
        (e.g. a JSON number) is compared instead of raising.
        """
        if expected is None or actual is None or expected == "" or actual == "":
            return expected == actual
        norm_expected = str(expected).strip().lower()
        norm_actual = str(actual).strip().lower()
        return norm_expected == norm_actual

    def run_suite(self, samples: List[EvaluationSample], base_dir: str) -> RegressionReport:
        """Evaluate every sample and aggregate the results.

        Args:
            samples: Samples to evaluate (any iterable is accepted).
            base_dir: Directory that sample image paths are relative to.

        Returns:
            A ``RegressionReport``; accuracy is the percentage of samples
            that passed, or 0.0 when there are no samples.
        """
        results = []
        error_counts = {
            "missing_field": 0,
            "incorrect_value": 0,
            "unexpected_field": 0,
            "image_not_found": 0,
            "low_confidence": 0,
            "bbox_mismatch": 0,
        }

        for sample in samples:
            res = self.evaluate_sample(sample, base_dir)
            results.append(res)

            for eval_item in res.field_evaluations:
                if eval_item.error_type in error_counts:
                    error_counts[eval_item.error_type] += 1
                elif eval_item.is_match and eval_item.confidence < self.tolerance_threshold:
                    # Untagged low-confidence match (e.g. produced by an
                    # evaluate_sample override); counted once, never twice.
                    error_counts["low_confidence"] += 1

        passed_count = sum(1 for r in results if r.passed)
        # Counted from results so samples need not support len().
        total_count = len(results)
        accuracy = (passed_count / total_count * 100) if total_count > 0 else 0.0

        return RegressionReport(
            total_samples=total_count,
            passed_samples=passed_count,
            failed_samples=total_count - passed_count,
            accuracy_percentage=accuracy,
            error_counts=error_counts,
            sample_results=results,
        )