From 1b44eb6982b2eaa057850fc1550782dff479ce7b Mon Sep 17 00:00:00 2001 From: hyeokjun32 Date: Fri, 1 May 2026 21:09:44 +0900 Subject: [PATCH] feat: add optional coco metric backend --- README.md | 1 + docs/portfolio/final_validation_completion.md | 2 +- .../yolov8_coco_subset_evaluation.md | 2 + .../subset/yolov8_coco_subset_evaluation.html | 3 + .../subset/yolov8_coco_subset_evaluation.json | 7 +- .../subset/yolov8_coco_subset_evaluation.md | 3 + inferedgelab/commands/evaluate_detection.py | 17 ++ inferedgelab/core/detection_evaluator.py | 90 +++--- inferedgelab/evaluation/__init__.py | 11 + inferedgelab/evaluation/coco_eval.py | 18 ++ inferedgelab/evaluation/metrics.py | 260 ++++++++++++++++++ .../evaluation/pycocotools_backend.py | 24 ++ tests/test_evaluate_detection.py | 42 +++ tests/test_metric_backends.py | 51 ++++ tests/test_validation_demo_report.py | 2 + 15 files changed, 490 insertions(+), 43 deletions(-) create mode 100644 inferedgelab/evaluation/__init__.py create mode 100644 inferedgelab/evaluation/coco_eval.py create mode 100644 inferedgelab/evaluation/metrics.py create mode 100644 inferedgelab/evaluation/pycocotools_backend.py create mode 100644 tests/test_metric_backends.py diff --git a/README.md b/README.md index 1e908fe..55a3e33 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,7 @@ CLI / API → Service Layer → Structured Result → Compare / Report InferEdgeLab treats model evaluation as a **contract/preset-based validation workflow**, not as a claim that any arbitrary model can be automatically scored without context. `evaluate-detection` now supports the `yolov8_coco` preset, optional `model_contract.json`, COCO annotations, YOLO txt labels, structural detection-output validation, and JSON/Markdown/HTML evaluation reports. +Metric evaluation defaults to the lightweight `--metric-backend simplified` path; `--metric-backend pycocotools` can be requested explicitly when the optional `pycocotools` package is installed. When annotations are not provided, accuracy is explicitly marked as `skipped` and the report records structural validation only. Planned presets such as `resnet_imagenet` and `custom_contract` keep future evaluation work scoped to explicit model contracts and dataset assumptions.
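A minimal sketch of the backend-selection flow the README paragraph above describes, built only from the helpers this patch adds (`get_metric_backend`, `ensure_available`, `supported_metric_backends`); the `resolve_backend` wrapper itself is hypothetical and not part of the patch:

```python
from inferedgelab.evaluation import (
    MetricBackendError,
    get_metric_backend,
    supported_metric_backends,
)


def resolve_backend(name: str = "simplified"):
    """Hypothetical wrapper mirroring the CLI's validation step."""
    try:
        backend = get_metric_backend(name)
        # "pycocotools" fails here when the optional package is missing;
        # unknown names already raised inside get_metric_backend().
        backend.ensure_available()
    except MetricBackendError as exc:
        supported = ", ".join(supported_metric_backends())
        raise SystemExit(f"{exc} Supported metric backends: {supported}") from exc
    return backend
```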
diff --git a/docs/portfolio/final_validation_completion.md b/docs/portfolio/final_validation_completion.md index bb9892a..64fbd60 100644 --- a/docs/portfolio/final_validation_completion.md +++ b/docs/portfolio/final_validation_completion.md @@ -70,7 +70,7 @@ These are intentionally outside the current completion boundary: - file upload product flow - production frontend deployment - authentication, billing, and multi-user controls -- full COCO official evaluation +- making optional official COCO evaluation a required dependency - more presets such as `resnet_imagenet` ## Portfolio Message diff --git a/docs/portfolio/yolov8_coco_subset_evaluation.md b/docs/portfolio/yolov8_coco_subset_evaluation.md index 833a052..62cd504 100644 --- a/docs/portfolio/yolov8_coco_subset_evaluation.md +++ b/docs/portfolio/yolov8_coco_subset_evaluation.md @@ -20,6 +20,7 @@ It is not a full COCO benchmark and should not be presented as production model | Samples | 10 | | Ground-truth boxes | 89 | | Post-NMS detections checked | 51 | +| Metric backend | simplified | | mAP@50 | 0.1410 | | mAP@50-95 | 0.0873 | | Precision | 0.2941 | @@ -31,6 +32,7 @@ It is not a full COCO benchmark and should not be presented as production model ## Interpretation This demo proves that InferEdgeLab can load COCO-style annotations, run the YOLOv8 detection evaluator, compute simplified accuracy metrics, validate detection output structure, and emit JSON/Markdown/HTML reports. +The report records `metrics.backend = simplified`; `pycocotools` remains an optional explicit backend rather than a required dependency. The numbers are intentionally documented as a small subset result only. They are useful as portfolio workflow evidence, not as a claim of full COCO accuracy. diff --git a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.html b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.html index f43b2d4..60ea81d 100644 --- a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.html +++ b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.html @@ -11,15 +11,18 @@ - deployment signal: `review` ## Metrics +- backend: `simplified` - map50: `0.14097840361885305` - map50_95: `0.08728567780534073` - f1_score: `0.21428571428571427` - precision: `0.29411764705882354` - recall: `0.16853932584269662` +- note: `lightweight simplified mAP50 implementation` ## Notes - Detection evaluation uses image directory traversal. - YOLOv8 postprocessing supports single-output and split boxes/scores output layouts. - Accuracy uses YOLO txt labels or COCO annotations when provided. - When annotations are missing, InferEdge records accuracy_skipped and structural validation only. +- Accuracy metrics backend: simplified lightweight mAP50. 
diff --git a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.json b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.json index 0ca59b0..01dbbcf 100644 --- a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.json +++ b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.json @@ -260,11 +260,13 @@ "accuracy": { "status": "evaluated", "metrics": { + "backend": "simplified", "map50": 0.14097840361885305, "map50_95": 0.08728567780534073, "f1_score": 0.21428571428571427, "precision": 0.29411764705882354, - "recall": 0.16853932584269662 + "recall": 0.16853932584269662, + "note": "lightweight simplified mAP50 implementation" }, "reason": null }, @@ -307,6 +309,7 @@ "Detection evaluation uses image directory traversal.", "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.", "Accuracy uses YOLO txt labels or COCO annotations when provided.", - "When annotations are missing, InferEdge records accuracy_skipped and structural validation only." + "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.", + "Accuracy metrics backend: simplified lightweight mAP50." ] } diff --git a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.md b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.md index 0e5bdf6..2e0dfa0 100644 --- a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.md +++ b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.md @@ -10,14 +10,17 @@ - deployment signal: `review` ## Metrics +- backend: `simplified` - map50: `0.14097840361885305` - map50_95: `0.08728567780534073` - f1_score: `0.21428571428571427` - precision: `0.29411764705882354` - recall: `0.16853932584269662` +- note: `lightweight simplified mAP50 implementation` ## Notes - Detection evaluation uses image directory traversal. - YOLOv8 postprocessing supports single-output and split boxes/scores output layouts. - Accuracy uses YOLO txt labels or COCO annotations when provided. - When annotations are missing, InferEdge records accuracy_skipped and structural validation only. +- Accuracy metrics backend: simplified lightweight mAP50. 
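The `backend` and `note` fields recorded in the JSON/Markdown/HTML reports above are assembled by the new `build_metric_payload` helper in `inferedgelab/evaluation/coco_eval.py`; a minimal sketch of that call, reusing the documented subset numbers rather than fresh measurements:

```python
from inferedgelab.evaluation.coco_eval import build_metric_payload

# Values below are copied from the subset report, not re-measured.
payload = build_metric_payload(
    backend="simplified",
    metrics={
        "map50": 0.14097840361885305,
        "map50_95": 0.08728567780534073,
        "f1_score": 0.21428571428571427,
        "precision": 0.29411764705882354,
        "recall": 0.16853932584269662,
    },
    note="lightweight simplified mAP50 implementation",
)
assert payload["backend"] == "simplified"
assert payload["note"] == "lightweight simplified mAP50 implementation"
```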
diff --git a/inferedgelab/commands/evaluate_detection.py b/inferedgelab/commands/evaluate_detection.py index 73b0379..51dc394 100644 --- a/inferedgelab/commands/evaluate_detection.py +++ b/inferedgelab/commands/evaluate_detection.py @@ -16,6 +16,9 @@ supported_engines, supported_engines_display, ) +from inferedgelab.evaluation.metrics import MetricBackendError +from inferedgelab.evaluation.metrics import get_metric_backend +from inferedgelab.evaluation.metrics import supported_metric_backends from inferedgelab.result.saver import save_result from inferedgelab.result.schema import BenchmarkResult from inferedgelab.utils.system_info import collect_system_snapshot @@ -48,6 +51,11 @@ def evaluate_detection_cmd( label_dir: str = typer.Option("", "--label-dir", help="YOLO txt label directory"), coco_annotations: str = typer.Option("", "--coco-annotations", help="COCO annotation JSON path"), preset: str = typer.Option("yolov8_coco", "--preset", help="Validation preset name"), + metric_backend: str = typer.Option( + "simplified", + "--metric-backend", + help="Metric backend: simplified or pycocotools", + ), model_contract: str = typer.Option("", "--model-contract", help="model_contract.json path"), num_classes: int = typer.Option(1, "--num-classes", help="number of classes"), precision: str = typer.Option("fp16", "--precision", help="precision metadata (fp32, fp16, int8)"), @@ -86,11 +94,17 @@ raise typer.BadParameter("--num-classes must be >= 1") coco_annotations = _option_string(coco_annotations) preset = _option_string(preset, "yolov8_coco") + metric_backend = _option_string(metric_backend, "simplified").strip().lower() model_contract = _option_string(model_contract) report_json = _option_string(report_json) report_md = _option_string(report_md) report_html = _option_string(report_html) preset = preset.strip().lower() + try: + get_metric_backend(metric_backend).ensure_available() + except MetricBackendError as exc: + supported = ", ".join(supported_metric_backends()) + raise typer.BadParameter(f"{exc} Supported metric backends: {supported}") from exc try: preset_def = get_preset(preset) contract = ( @@ -121,6 +135,7 @@ use_rgb=rgb, input_size=640, debug_samples=debug_samples, + metric_backend=metric_backend, ) except RuntimeError as exc: _exit_with_runtime_error(str(exc)) @@ -156,6 +171,7 @@ "model_contract_path": model_contract.strip() or None, "coco_annotations": coco_annotations.strip() or None, "num_classes": num_classes, + "metric_backend": metric_backend, }, accuracy=accuracy_payload, extra={ @@ -190,6 +206,7 @@ rprint(f"Images : {image_dir}") rprint(f"Labels : {label_dir or '(not provided)'}") rprint(f"COCO annotations: {coco_annotations or '(not provided)'}") + rprint(f"Metric backend : {eval_result.metrics.get('backend', metric_backend)}") rprint(f"Samples : {eval_result.sample_count}") rprint(f"Accuracy status : {eval_result.extra.get('accuracy_status', 'evaluated')}") if eval_result.extra.get("accuracy_status") == "skipped": diff --git a/inferedgelab/core/detection_evaluator.py b/inferedgelab/core/detection_evaluator.py index 44688dc..9715a14 100644 --- a/inferedgelab/core/detection_evaluator.py +++ b/inferedgelab/core/detection_evaluator.py @@ -10,6 +10,8 @@ from inferedgelab.engines.base import EngineModelIO from inferedgelab.engines.registry import create_engine, normalize_engine_name +from inferedgelab.evaluation.metrics import MetricBackendError +from inferedgelab.evaluation.metrics import
get_metric_backend from inferedgelab.validation.coco import load_coco_ground_truths from inferedgelab.validation.structural import validate_detection_structure @@ -33,7 +35,7 @@ class DetectionEvalResult: engine: str device: str sample_count: int - metrics: Dict[str, float] + metrics: Dict[str, Any] notes: List[str] model_input: Dict[str, Any] actual_input_shape: List[int] @@ -810,8 +812,11 @@ def evaluate_detection_engine( use_rgb: bool = True, input_size: int = 640, debug_samples: int = 0, + metric_backend: str = "simplified", ) -> DetectionEvalResult: engine_name = normalize_engine_name(engine_name) + metric_backend_impl = get_metric_backend(metric_backend) + metric_backend_impl.ensure_available() engine = create_engine(engine_name) load_kwargs: dict[str, Any] = {} @@ -912,53 +917,52 @@ def evaluate_detection_engine( ) if accuracy_status == "evaluated": - precision, recall, f1_score = compute_precision_recall_f1( - predictions_by_image, - ground_truths_by_image, - num_classes=num_classes, - iou_threshold=iou_threshold, - ) - map50 = compute_average_precision( - predictions_by_image, - ground_truths_by_image, - num_classes=num_classes, - iou_threshold=0.5, - ) - map_thresholds = np.arange(0.5, 1.0, 0.05) - map50_95 = float( - np.mean( - [ - compute_average_precision( - predictions_by_image, - ground_truths_by_image, - num_classes=num_classes, - iou_threshold=float(threshold), - ) - for threshold in map_thresholds - ] - ) - ) + backend_result = metric_backend_impl.evaluate( + predictions_by_image=predictions_by_image, + ground_truths_by_image=ground_truths_by_image, + num_classes=num_classes, + iou_threshold=iou_threshold, + average_precision_fn=compute_average_precision, + precision_recall_fn=compute_precision_recall_f1, + mean_fn=lambda values: float(np.mean(values)), + ) + metrics = backend_result.metrics + metric_notes = backend_result.notes + metric_warnings = backend_result.warnings else: - precision = recall = f1_score = map50 = map50_95 = 0.0 + metrics = { + "backend": metric_backend_impl.name, + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0, + "note": ( + "lightweight simplified mAP50 implementation" + if metric_backend_impl.name == "simplified" + else "accuracy skipped before metric backend execution" + ), + } + metric_notes = [f"Accuracy metrics backend: {metric_backend_impl.name}."] + metric_warnings = [] + + notes = [ + "Detection evaluation uses image directory traversal.", + "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.", + "Accuracy uses YOLO txt labels or COCO annotations when provided.", + "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.", + *metric_notes, + ] + if metric_warnings: + notes.extend(f"Metric warning: {warning}" for warning in metric_warnings) return DetectionEvalResult( task="detection", engine=engine.name, device=engine.device, sample_count=len(image_files), - metrics={ - "map50": map50, - "map50_95": map50_95, - "f1_score": f1_score, - "precision": precision, - "recall": recall, - }, - notes=[ - "Detection evaluation uses image directory traversal.", - "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.", - "Accuracy uses YOLO txt labels or COCO annotations when provided.", - "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.", - ], + metrics=metrics, + notes=notes, model_input={ "name": model_input.name,
"dtype": str(model_input.dtype), @@ -979,6 +986,7 @@ def evaluate_detection_engine( "input_size": input_size, "rgb": use_rgb, "num_classes": num_classes, + "metric_backend": metric_backend_impl.name, }, extra={ "engine_path": engine_path, @@ -991,6 +999,8 @@ def evaluate_detection_engine( "accuracy_status": accuracy_status, "accuracy_skip_reason": accuracy_skip_reason, "structural_validation": structural_validation, + "metric_backend": metric_backend_impl.name, + "metric_warnings": metric_warnings, }, ) finally: diff --git a/inferedgelab/evaluation/__init__.py b/inferedgelab/evaluation/__init__.py new file mode 100644 index 0000000..52cde34 --- /dev/null +++ b/inferedgelab/evaluation/__init__.py @@ -0,0 +1,11 @@ +"""Metric backend helpers for InferEdge evaluation.""" + +from inferedgelab.evaluation.metrics import MetricBackendError +from inferedgelab.evaluation.metrics import get_metric_backend +from inferedgelab.evaluation.metrics import supported_metric_backends + +__all__ = [ + "MetricBackendError", + "get_metric_backend", + "supported_metric_backends", +] diff --git a/inferedgelab/evaluation/coco_eval.py b/inferedgelab/evaluation/coco_eval.py new file mode 100644 index 0000000..54e5c49 --- /dev/null +++ b/inferedgelab/evaluation/coco_eval.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +from typing import Any + + +def build_metric_payload( + *, + backend: str, + metrics: dict[str, Any], + note: str | None = None, + warnings: list[str] | None = None, +) -> dict[str, Any]: + payload = {"backend": backend, **metrics} + if note: + payload["note"] = note + if warnings: + payload["warnings"] = list(warnings) + return payload diff --git a/inferedgelab/evaluation/metrics.py b/inferedgelab/evaluation/metrics.py new file mode 100644 index 0000000..3b4125f --- /dev/null +++ b/inferedgelab/evaluation/metrics.py @@ -0,0 +1,260 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Callable, Protocol, Sequence + +import numpy as np + +from inferedgelab.evaluation.coco_eval import build_metric_payload +from inferedgelab.evaluation.pycocotools_backend import PycocotoolsUnavailableError +from inferedgelab.evaluation.pycocotools_backend import require_pycocotools + + +class MetricBackendError(RuntimeError): + """Raised when a metric backend cannot evaluate the requested payload.""" + + +class AveragePrecisionFn(Protocol): + def __call__( + self, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + *, + num_classes: int, + iou_threshold: float, + ) -> float: ... + + +class PrecisionRecallFn(Protocol): + def __call__( + self, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + *, + num_classes: int, + iou_threshold: float, + ) -> tuple[float, float, float]: ... + + +@dataclass(frozen=True) +class MetricBackendResult: + metrics: dict[str, Any] + notes: list[str] + warnings: list[str] + + +class MetricBackend(Protocol): + name: str + + def ensure_available(self) -> None: ... + + def evaluate( + self, + *, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + num_classes: int, + iou_threshold: float, + average_precision_fn: AveragePrecisionFn, + precision_recall_fn: PrecisionRecallFn, + mean_fn: Callable[[list[float]], float], + ) -> MetricBackendResult: ... 
+ + +class SimplifiedMap50Backend: + name = "simplified" + + def ensure_available(self) -> None: + return None + + def evaluate( + self, + *, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + num_classes: int, + iou_threshold: float, + average_precision_fn: AveragePrecisionFn, + precision_recall_fn: PrecisionRecallFn, + mean_fn: Callable[[list[float]], float], + ) -> MetricBackendResult: + precision, recall, f1_score = precision_recall_fn( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=iou_threshold, + ) + map50 = average_precision_fn( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=0.5, + ) + thresholds = [round(0.5 + 0.05 * index, 2) for index in range(10)] + map50_95 = float( + mean_fn( + [ + average_precision_fn( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=float(threshold), + ) + for threshold in thresholds + ] + ) + ) + return MetricBackendResult( + metrics=build_metric_payload( + backend=self.name, + metrics={ + "map50": float(map50), + "map50_95": map50_95, + "f1_score": float(f1_score), + "precision": float(precision), + "recall": float(recall), + }, + note="lightweight simplified mAP50 implementation", + ), + notes=["Accuracy metrics backend: simplified lightweight mAP50."], + warnings=[], + ) + + +class PycocotoolsBackend: + name = "pycocotools" + + def ensure_available(self) -> None: + try: + require_pycocotools() + except PycocotoolsUnavailableError as exc: + raise MetricBackendError(str(exc)) from exc + + def evaluate( + self, + *, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + num_classes: int, + iou_threshold: float, + average_precision_fn: AveragePrecisionFn, + precision_recall_fn: PrecisionRecallFn, + mean_fn: Callable[[list[float]], float], + ) -> MetricBackendResult: + modules = require_pycocotools() + coco_cls = modules["COCO"] + cocoeval_cls = modules["COCOeval"] + + images = [{"id": index + 1} for index, _ in enumerate(ground_truths_by_image)] + categories = [{"id": class_id, "name": str(class_id)} for class_id in range(num_classes)] + annotations: list[dict[str, Any]] = [] + detections: list[dict[str, Any]] = [] + + annotation_id = 1 + for image_index, ground_truths in enumerate(ground_truths_by_image, start=1): + for ground_truth in ground_truths: + box = _xyxy_to_xywh(getattr(ground_truth, "box")) + annotations.append( + { + "id": annotation_id, + "image_id": image_index, + "category_id": int(getattr(ground_truth, "class_id")), + "bbox": box, + "area": box[2] * box[3], + "iscrowd": 0, + } + ) + annotation_id += 1 + + for image_index, predictions in enumerate(predictions_by_image, start=1): + for prediction in predictions: + detections.append( + { + "image_id": image_index, + "category_id": int(getattr(prediction, "class_id")), + "bbox": _xyxy_to_xywh(getattr(prediction, "box")), + "score": float(getattr(prediction, "confidence")), + } + ) + + if not annotations: + return MetricBackendResult( + metrics=build_metric_payload( + backend=self.name, + metrics={ + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0, + }, + warnings=["No COCO annotations were available for pycocotools evaluation."], + ), + notes=["Accuracy metrics backend: pycocotools."], + warnings=["No COCO annotations were available for pycocotools evaluation."], + ) + + coco_gt = coco_cls() + coco_gt.dataset = { + "images": images, + "annotations": annotations, + 
"categories": categories, + "info": {}, + "licenses": [], + } + coco_gt.createIndex() + + coco_dt = coco_gt.loadRes(detections) if detections else coco_gt.loadRes([]) + coco_eval = cocoeval_cls(coco_gt, coco_dt, "bbox") + coco_eval.params.catIds = [item["id"] for item in categories] + coco_eval.params.imgIds = [item["id"] for item in images] + coco_eval.evaluate() + coco_eval.accumulate() + + precision_values = coco_eval.eval["precision"] + recall_values = coco_eval.eval["recall"] + valid_precision = precision_values[precision_values > -1] + valid_recall = recall_values[recall_values > -1] + map50_95 = float(np.mean(valid_precision)) if valid_precision.size else 0.0 + map50_precision = precision_values[0] + valid_map50_precision = map50_precision[map50_precision > -1] + map50 = float(np.mean(valid_map50_precision)) if valid_map50_precision.size else 0.0 + recall = float(np.mean(valid_recall)) if valid_recall.size else 0.0 + precision = map50 + f1_score = 0.0 + if precision + recall > 0: + f1_score = 2.0 * precision * recall / (precision + recall) + + return MetricBackendResult( + metrics=build_metric_payload( + backend=self.name, + metrics={ + "map50": map50, + "map50_95": map50_95, + "f1_score": float(f1_score), + "precision": precision, + "recall": recall, + }, + ), + notes=["Accuracy metrics backend: pycocotools official COCO evaluator."], + warnings=[], + ) + + +def supported_metric_backends() -> tuple[str, ...]: + return ("simplified", "pycocotools") + + +def get_metric_backend(name: str) -> MetricBackend: + normalized = name.strip().lower() + if normalized == "simplified": + return SimplifiedMap50Backend() + if normalized == "pycocotools": + return PycocotoolsBackend() + supported = ", ".join(supported_metric_backends()) + raise MetricBackendError(f"unsupported metric backend: {name}. Supported backends: {supported}") + + +def _xyxy_to_xywh(box: Sequence[float]) -> list[float]: + x1, y1, x2, y2 = [float(value) for value in box] + return [x1, y1, max(0.0, x2 - x1), max(0.0, y2 - y1)] diff --git a/inferedgelab/evaluation/pycocotools_backend.py b/inferedgelab/evaluation/pycocotools_backend.py new file mode 100644 index 0000000..db18fb7 --- /dev/null +++ b/inferedgelab/evaluation/pycocotools_backend.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import importlib +from typing import Any + + +class PycocotoolsUnavailableError(RuntimeError): + """Raised when the optional pycocotools backend is requested but unavailable.""" + + +def require_pycocotools() -> dict[str, Any]: + try: + coco_module = importlib.import_module("pycocotools.coco") + cocoeval_module = importlib.import_module("pycocotools.cocoeval") + except ImportError as exc: + raise PycocotoolsUnavailableError( + "pycocotools backend requested but pycocotools is not installed. 
" + "Hint: pip install pycocotools" + ) from exc + + return { + "COCO": getattr(coco_module, "COCO"), + "COCOeval": getattr(cocoeval_module, "COCOeval"), + } diff --git a/tests/test_evaluate_detection.py b/tests/test_evaluate_detection.py index 9adc24e..52a4e83 100644 --- a/tests/test_evaluate_detection.py +++ b/tests/test_evaluate_detection.py @@ -169,11 +169,13 @@ def test_accuracy_payload_save_keeps_task_and_metrics_structure(tmp_path): device="gpu", sample_count=2, metrics={ + "backend": "simplified", "map50": 0.9, "map50_95": 0.7, "f1_score": 0.8, "precision": 0.85, "recall": 0.75, + "note": "lightweight simplified mAP50 implementation", }, notes=[], model_input={"name": "images", "dtype": "float32", "shape": [1, 3, 640, 640]}, @@ -195,6 +197,7 @@ def test_accuracy_payload_save_keeps_task_and_metrics_structure(tmp_path): saved = json.loads(out_json.read_text(encoding="utf-8")) assert saved["task"] == "detection" + assert saved["metrics"]["backend"] == "simplified" assert saved["metrics"]["map50"] == pytest.approx(0.9) assert saved["metrics"]["f1_score"] == pytest.approx(0.8) assert saved["dataset"]["sample_count"] == 2 @@ -213,11 +216,13 @@ def fake_evaluate_detection_engine(**kwargs): device="gpu", sample_count=3, metrics={ + "backend": "simplified", "map50": 0.7791, "map50_95": 0.5512, "f1_score": 0.8180, "precision": 0.7950, "recall": 0.8424, + "note": "lightweight simplified mAP50 implementation", }, notes=[], model_input={"name": "images", "dtype": "float16", "shape": [1, 3, 640, 640]}, @@ -266,11 +271,14 @@ def fake_save_result(result, out_dir="results"): payload = json.loads(out_json.read_text(encoding="utf-8")) assert payload["task"] == "detection" + assert payload["metrics"]["backend"] == "simplified" assert payload["metrics"]["map50"] == pytest.approx(0.7791) assert captured["result"].accuracy["task"] == "detection" assert captured["result"].accuracy["metrics"]["map50"] == pytest.approx(0.7791) assert captured["result"].run_config["mode"] == "evaluate-detection" + assert captured["result"].run_config["metric_backend"] == "simplified" assert captured["engine_kwargs"]["debug_samples"] == 0 + assert captured["engine_kwargs"]["metric_backend"] == "simplified" def test_evaluate_detection_command_writes_contract_evaluation_report(tmp_path, monkeypatch): @@ -286,11 +294,13 @@ def fake_evaluate_detection_engine(**kwargs): device="cpu", sample_count=1, metrics={ + "backend": "simplified", "map50": 0.0, "map50_95": 0.0, "f1_score": 0.0, "precision": 0.0, "recall": 0.0, + "note": "lightweight simplified mAP50 implementation", }, notes=["structural validation only"], model_input={"name": "images", "dtype": "float32", "shape": [1, 3, 640, 640]}, @@ -341,8 +351,10 @@ def fake_evaluate_detection_engine(**kwargs): report = json.loads(report_json.read_text(encoding="utf-8")) assert captured["engine_kwargs"]["label_dir"] is None assert captured["engine_kwargs"]["coco_annotations"] is None + assert captured["engine_kwargs"]["metric_backend"] == "simplified" assert report["model_contract"]["preset"] == "yolov8_coco" assert report["accuracy"]["status"] == "skipped" + assert report["accuracy"]["metrics"]["backend"] == "simplified" assert "accuracy skipped reason" in report_md.read_text(encoding="utf-8") @@ -441,6 +453,36 @@ def test_evaluate_detection_help_shows_debug_samples_option(): assert "--model-contract" in result.stdout assert "--preset" in result.stdout assert "--coco-annotations" in result.stdout + assert "--metric-backend" in result.stdout + + +def 
test_evaluate_detection_command_rejects_unsupported_metric_backend(): + from inferedgelab.commands import evaluate_detection + + with pytest.raises(Exception, match="unsupported metric backend"): + evaluate_detection.evaluate_detection_cmd( + model_path="models/onnx/yolov8n.onnx", + engine="onnxruntime", + engine_path="", + image_dir="images", + label_dir="labels", + metric_backend="made_up_backend", + preset="yolov8_coco", + model_contract="", + num_classes=1, + precision="fp32", + conf_threshold=0.2, + nms_threshold=0.45, + iou_threshold=0.5, + rgb=True, + debug_samples=0, + out_json="", + report_json="", + report_md="", + report_html="", + out_dir="results", + save_structured_result=False, + ) def test_cli_help_registers_evaluate_detection_command(): diff --git a/tests/test_metric_backends.py b/tests/test_metric_backends.py new file mode 100644 index 0000000..7b11a3e --- /dev/null +++ b/tests/test_metric_backends.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import importlib + +import pytest + +from inferedgelab.evaluation.metrics import MetricBackendError +from inferedgelab.evaluation.metrics import get_metric_backend +from inferedgelab.evaluation.metrics import supported_metric_backends +from inferedgelab.evaluation.pycocotools_backend import require_pycocotools + + +def test_simplified_metric_backend_records_backend_and_note(): + backend = get_metric_backend("simplified") + + result = backend.evaluate( + predictions_by_image=[], + ground_truths_by_image=[], + num_classes=1, + iou_threshold=0.5, + average_precision_fn=lambda *args, **kwargs: 0.68, + precision_recall_fn=lambda *args, **kwargs: (0.7, 0.6, 0.646), + mean_fn=lambda values: sum(values) / len(values), + ) + + assert result.metrics["backend"] == "simplified" + assert result.metrics["map50"] == pytest.approx(0.68) + assert result.metrics["precision"] == pytest.approx(0.7) + assert result.metrics["recall"] == pytest.approx(0.6) + assert result.metrics["note"] == "lightweight simplified mAP50 implementation" + + +def test_unsupported_metric_backend_fails_clearly(): + with pytest.raises(MetricBackendError, match="unsupported metric backend"): + get_metric_backend("made_up_backend") + + +def test_pycocotools_backend_requested_without_dependency_fails_clearly(monkeypatch): + real_import_module = importlib.import_module + + def fake_import_module(name: str): + # Delegate to the saved original: calling importlib.import_module here would re-enter this fake once it is monkeypatched in below. + if name.startswith("pycocotools"): + raise ImportError("missing pycocotools") + return real_import_module(name) + + monkeypatch.setattr(importlib, "import_module", fake_import_module) + + with pytest.raises(RuntimeError, match="pycocotools backend requested but pycocotools is not installed"): + require_pycocotools() + + +def test_supported_metric_backends_include_simplified_and_pycocotools(): + assert supported_metric_backends() == ("simplified", "pycocotools") diff --git a/tests/test_validation_demo_report.py b/tests/test_validation_demo_report.py index 4591f2d..ed39aa0 100644 --- a/tests/test_validation_demo_report.py +++ b/tests/test_validation_demo_report.py @@ -17,6 +17,8 @@ def test_yolov8_coco_subset_demo_report_contains_evaluated_accuracy(): assert report["preset"]["name"] == "yolov8_coco" assert report["runtime_result"]["sample_count"] == 10 assert report["accuracy"]["status"] == "evaluated" + assert report["accuracy"]["metrics"]["backend"] == "simplified" + assert report["accuracy"]["metrics"]["note"] == "lightweight simplified mAP50 implementation" assert round(report["accuracy"]["metrics"]["map50"], 4) == 0.141 assert round(report["accuracy"]["metrics"]["precision"], 4) == 0.2941 assert
round(report["accuracy"]["metrics"]["recall"], 4) == 0.1685
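For completeness, a self-contained sketch of driving a metric backend directly, mirroring how the tests above inject stub scoring callables; the `Detection` dataclass is a hypothetical stand-in for the evaluator's records (the pycocotools backend reads only `box`, `class_id`, and `confidence` via getattr, while the simplified backend passes the records through untouched to the injected callables):

```python
from dataclasses import dataclass

from inferedgelab.evaluation.metrics import get_metric_backend


@dataclass
class Detection:
    # Hypothetical record type standing in for the evaluator's detections.
    box: tuple[float, float, float, float]  # xyxy pixel coordinates
    class_id: int
    confidence: float = 1.0


ground_truths = [[Detection((10.0, 10.0, 50.0, 50.0), 0)]]
predictions = [[Detection((12.0, 11.0, 49.0, 52.0), 0, 0.9)]]

backend = get_metric_backend("simplified")
result = backend.evaluate(
    predictions_by_image=predictions,
    ground_truths_by_image=ground_truths,
    num_classes=1,
    iou_threshold=0.5,
    # Stub scoring callables, as in tests/test_metric_backends.py; the real
    # evaluator passes compute_average_precision / compute_precision_recall_f1.
    average_precision_fn=lambda p, g, *, num_classes, iou_threshold: 1.0,
    precision_recall_fn=lambda p, g, *, num_classes, iou_threshold: (1.0, 1.0, 1.0),
    mean_fn=lambda values: sum(values) / len(values),
)
print(result.metrics["backend"], result.metrics["map50"])  # simplified 1.0
```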