From 1b44eb6982b2eaa057850fc1550782dff479ce7b Mon Sep 17 00:00:00 2001 From: hyeokjun32 Date: Fri, 1 May 2026 21:09:44 +0900 Subject: [PATCH] feat: add optional coco metric backend --- README.md | 1 + docs/portfolio/final_validation_completion.md | 2 +- .../yolov8_coco_subset_evaluation.md | 2 + .../subset/yolov8_coco_subset_evaluation.html | 3 + .../subset/yolov8_coco_subset_evaluation.json | 7 +- .../subset/yolov8_coco_subset_evaluation.md | 3 + inferedgelab/commands/evaluate_detection.py | 17 ++ inferedgelab/core/detection_evaluator.py | 90 +++--- inferedgelab/evaluation/__init__.py | 11 + inferedgelab/evaluation/coco_eval.py | 18 ++ inferedgelab/evaluation/metrics.py | 260 ++++++++++++++++++ .../evaluation/pycocotools_backend.py | 24 ++ tests/test_evaluate_detection.py | 42 +++ tests/test_metric_backends.py | 51 ++++ tests/test_validation_demo_report.py | 2 + 15 files changed, 490 insertions(+), 43 deletions(-) create mode 100644 inferedgelab/evaluation/__init__.py create mode 100644 inferedgelab/evaluation/coco_eval.py create mode 100644 inferedgelab/evaluation/metrics.py create mode 100644 inferedgelab/evaluation/pycocotools_backend.py create mode 100644 tests/test_metric_backends.py diff --git a/README.md b/README.md index 1e908fe..55a3e33 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,7 @@ CLI / API → Service Layer → Structured Result → Compare / Report InferEdgeLab treats model evaluation as a **contract/preset-based validation workflow**, not as a claim that any arbitrary model can be automatically scored without context. `evaluate-detection` now supports the `yolov8_coco` preset, optional `model_contract.json`, COCO annotations, YOLO txt labels, structural detection-output validation, and JSON/Markdown/HTML evaluation reports. +Metric evaluation defaults to the lightweight `--metric-backend simplified` path; `--metric-backend pycocotools` can be requested explicitly when the optional `pycocotools` package is installed. When annotations are not provided, accuracy is explicitly marked as `skipped` and the report records structural validation only. Planned presets such as `resnet_imagenet` and `custom_contract` keep future evaluation work scoped to explicit model contracts and dataset assumptions.
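A minimal sketch of the backend-selection flow the README paragraph above describes, built only from the helpers this patch adds (`get_metric_backend`, `ensure_available`, `supported_metric_backends`); the `resolve_backend` wrapper itself is hypothetical and not part of the patch:

```python
from inferedgelab.evaluation import (
    MetricBackendError,
    get_metric_backend,
    supported_metric_backends,
)


def resolve_backend(name: str = "simplified"):
    """Hypothetical wrapper mirroring the CLI's validation step."""
    try:
        backend = get_metric_backend(name)
        # "pycocotools" fails here when the optional package is missing;
        # unknown names already raised inside get_metric_backend().
        backend.ensure_available()
    except MetricBackendError as exc:
        supported = ", ".join(supported_metric_backends())
        raise SystemExit(f"{exc} Supported metric backends: {supported}") from exc
    return backend
```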
diff --git a/docs/portfolio/final_validation_completion.md b/docs/portfolio/final_validation_completion.md index bb9892a..64fbd60 100644 --- a/docs/portfolio/final_validation_completion.md +++ b/docs/portfolio/final_validation_completion.md @@ -70,7 +70,7 @@ These are intentionally outside the current completion boundary: - file upload product flow - production frontend deployment - authentication, billing, and multi-user controls -- full COCO official evaluation +- making optional official COCO evaluation a required dependency - more presets such as `resnet_imagenet` ## Portfolio Message diff --git a/docs/portfolio/yolov8_coco_subset_evaluation.md b/docs/portfolio/yolov8_coco_subset_evaluation.md index 833a052..62cd504 100644 --- a/docs/portfolio/yolov8_coco_subset_evaluation.md +++ b/docs/portfolio/yolov8_coco_subset_evaluation.md @@ -20,6 +20,7 @@ It is not a full COCO benchmark and should not be presented as production model | Samples | 10 | | Ground-truth boxes | 89 | | Post-NMS detections checked | 51 | +| Metric backend | simplified | | mAP@50 | 0.1410 | | mAP@50-95 | 0.0873 | | Precision | 0.2941 | @@ -31,6 +32,7 @@ It is not a full COCO benchmark and should not be presented as production model ## Interpretation This demo proves that InferEdgeLab can load COCO-style annotations, run the YOLOv8 detection evaluator, compute simplified accuracy metrics, validate detection output structure, and emit JSON/Markdown/HTML reports. +The report records `metrics.backend = simplified`; `pycocotools` remains an optional explicit backend rather than a required dependency. The numbers are intentionally documented as a small subset result only. They are useful as portfolio workflow evidence, not as a claim of full COCO accuracy. diff --git a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.html b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.html index f43b2d4..60ea81d 100644 --- a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.html +++ b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.html @@ -11,15 +11,18 @@ - deployment signal: `review` ## Metrics +- backend: `simplified` - map50: `0.14097840361885305` - map50_95: `0.08728567780534073` - f1_score: `0.21428571428571427` - precision: `0.29411764705882354` - recall: `0.16853932584269662` +- note: `lightweight simplified mAP50 implementation` ## Notes - Detection evaluation uses image directory traversal. - YOLOv8 postprocessing supports single-output and split boxes/scores output layouts. - Accuracy uses YOLO txt labels or COCO annotations when provided. - When annotations are missing, InferEdge records accuracy_skipped and structural validation only. +- Accuracy metrics backend: simplified lightweight mAP50. 
diff --git a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.json b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.json index 0ca59b0..01dbbcf 100644 --- a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.json +++ b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.json @@ -260,11 +260,13 @@ "accuracy": { "status": "evaluated", "metrics": { + "backend": "simplified", "map50": 0.14097840361885305, "map50_95": 0.08728567780534073, "f1_score": 0.21428571428571427, "precision": 0.29411764705882354, - "recall": 0.16853932584269662 + "recall": 0.16853932584269662, + "note": "lightweight simplified mAP50 implementation" }, "reason": null }, @@ -307,6 +309,7 @@ "Detection evaluation uses image directory traversal.", "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.", "Accuracy uses YOLO txt labels or COCO annotations when provided.", - "When annotations are missing, InferEdge records accuracy_skipped and structural validation only." + "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.", + "Accuracy metrics backend: simplified lightweight mAP50." ] } diff --git a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.md b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.md index 0e5bdf6..2e0dfa0 100644 --- a/examples/validation_demo/subset/yolov8_coco_subset_evaluation.md +++ b/examples/validation_demo/subset/yolov8_coco_subset_evaluation.md @@ -10,14 +10,17 @@ - deployment signal: `review` ## Metrics +- backend: `simplified` - map50: `0.14097840361885305` - map50_95: `0.08728567780534073` - f1_score: `0.21428571428571427` - precision: `0.29411764705882354` - recall: `0.16853932584269662` +- note: `lightweight simplified mAP50 implementation` ## Notes - Detection evaluation uses image directory traversal. - YOLOv8 postprocessing supports single-output and split boxes/scores output layouts. - Accuracy uses YOLO txt labels or COCO annotations when provided. - When annotations are missing, InferEdge records accuracy_skipped and structural validation only. +- Accuracy metrics backend: simplified lightweight mAP50. 
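The `backend` and `note` fields recorded in the JSON/Markdown/HTML reports above are assembled by the new `build_metric_payload` helper in `inferedgelab/evaluation/coco_eval.py`; a minimal sketch of that call, reusing the documented subset numbers rather than fresh measurements:

```python
from inferedgelab.evaluation.coco_eval import build_metric_payload

# Values below are copied from the subset report, not re-measured.
payload = build_metric_payload(
    backend="simplified",
    metrics={
        "map50": 0.14097840361885305,
        "map50_95": 0.08728567780534073,
        "f1_score": 0.21428571428571427,
        "precision": 0.29411764705882354,
        "recall": 0.16853932584269662,
    },
    note="lightweight simplified mAP50 implementation",
)
assert payload["backend"] == "simplified"
assert payload["note"] == "lightweight simplified mAP50 implementation"
```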
diff --git a/inferedgelab/commands/evaluate_detection.py b/inferedgelab/commands/evaluate_detection.py index 73b0379..51dc394 100644 --- a/inferedgelab/commands/evaluate_detection.py +++ b/inferedgelab/commands/evaluate_detection.py @@ -16,6 +16,9 @@ supported_engines, supported_engines_display, ) +from inferedgelab.evaluation.metrics import MetricBackendError +from inferedgelab.evaluation.metrics import get_metric_backend +from inferedgelab.evaluation.metrics import supported_metric_backends from inferedgelab.result.saver import save_result from inferedgelab.result.schema import BenchmarkResult from inferedgelab.utils.system_info import collect_system_snapshot @@ -48,6 +51,11 @@ def evaluate_detection_cmd( label_dir: str = typer.Option("", "--label-dir", help="YOLO txt label directory"), coco_annotations: str = typer.Option("", "--coco-annotations", help="COCO annotation JSON path"), preset: str = typer.Option("yolov8_coco", "--preset", help="Validation preset name"), + metric_backend: str = typer.Option( + "simplified", + "--metric-backend", + help="Metric backend: simplified or pycocotools", + ), model_contract: str = typer.Option("", "--model-contract", help="model_contract.json path"), num_classes: int = typer.Option(1, "--num-classes", help="number of classes"), precision: str = typer.Option("fp16", "--precision", help="precision metadata (fp32, fp16, int8)"), @@ -86,11 +94,17 @@ raise typer.BadParameter("--num-classes must be >= 1") coco_annotations = _option_string(coco_annotations) preset = _option_string(preset, "yolov8_coco") + metric_backend = _option_string(metric_backend, "simplified").strip().lower() model_contract = _option_string(model_contract) report_json = _option_string(report_json) report_md = _option_string(report_md) report_html = _option_string(report_html) preset = preset.strip().lower() + try: + get_metric_backend(metric_backend).ensure_available() + except MetricBackendError as exc: + supported = ", ".join(supported_metric_backends()) + raise typer.BadParameter(f"{exc} Supported metric backends: {supported}") from exc try: preset_def = get_preset(preset) contract = ( @@ -121,6 +135,7 @@ use_rgb=rgb, input_size=640, debug_samples=debug_samples, + metric_backend=metric_backend, ) except RuntimeError as exc: _exit_with_runtime_error(str(exc)) @@ -156,6 +171,7 @@ "model_contract_path": model_contract.strip() or None, "coco_annotations": coco_annotations.strip() or None, "num_classes": num_classes, + "metric_backend": metric_backend, }, accuracy=accuracy_payload, extra={ @@ -190,6 +206,7 @@ rprint(f"Images : {image_dir}") rprint(f"Labels : {label_dir or '(not provided)'}") rprint(f"COCO annotations: {coco_annotations or '(not provided)'}") + rprint(f"Metric backend : {eval_result.metrics.get('backend', metric_backend)}") rprint(f"Samples : {eval_result.sample_count}") rprint(f"Accuracy status : {eval_result.extra.get('accuracy_status', 'evaluated')}") if eval_result.extra.get("accuracy_status") == "skipped": diff --git a/inferedgelab/core/detection_evaluator.py b/inferedgelab/core/detection_evaluator.py index 44688dc..9715a14 100644 --- a/inferedgelab/core/detection_evaluator.py +++ b/inferedgelab/core/detection_evaluator.py @@ -10,6 +10,8 @@ from inferedgelab.engines.base import EngineModelIO from inferedgelab.engines.registry import create_engine, normalize_engine_name +from inferedgelab.evaluation.metrics import MetricBackendError +from inferedgelab.evaluation.metrics import
get_metric_backend from inferedgelab.validation.coco import load_coco_ground_truths from inferedgelab.validation.structural import validate_detection_structure @@ -33,7 +35,7 @@ class DetectionEvalResult: engine: str device: str sample_count: int - metrics: Dict[str, float] + metrics: Dict[str, Any] notes: List[str] model_input: Dict[str, Any] actual_input_shape: List[int] @@ -810,8 +812,11 @@ def evaluate_detection_engine( use_rgb: bool = True, input_size: int = 640, debug_samples: int = 0, + metric_backend: str = "simplified", ) -> DetectionEvalResult: engine_name = normalize_engine_name(engine_name) + metric_backend_impl = get_metric_backend(metric_backend) + metric_backend_impl.ensure_available() engine = create_engine(engine_name) load_kwargs: dict[str, Any] = {} @@ -912,53 +917,52 @@ def evaluate_detection_engine( ) if accuracy_status == "evaluated": - precision, recall, f1_score = compute_precision_recall_f1( - predictions_by_image, - ground_truths_by_image, - num_classes=num_classes, - iou_threshold=iou_threshold, - ) - map50 = compute_average_precision( - predictions_by_image, - ground_truths_by_image, - num_classes=num_classes, - iou_threshold=0.5, - ) - map_thresholds = np.arange(0.5, 1.0, 0.05) - map50_95 = float( - np.mean( - [ - compute_average_precision( - predictions_by_image, - ground_truths_by_image, - num_classes=num_classes, - iou_threshold=float(threshold), - ) - for threshold in map_thresholds - ] - ) - ) + backend_result = metric_backend_impl.evaluate( + predictions_by_image=predictions_by_image, + ground_truths_by_image=ground_truths_by_image, + num_classes=num_classes, + iou_threshold=iou_threshold, + average_precision_fn=compute_average_precision, + precision_recall_fn=compute_precision_recall_f1, + mean_fn=lambda values: float(np.mean(values)), + ) + metrics = backend_result.metrics + metric_notes = backend_result.notes + metric_warnings = backend_result.warnings else: - precision = recall = f1_score = map50 = map50_95 = 0.0 + metrics = { + "backend": metric_backend_impl.name, + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0, + "note": ( + "lightweight simplified mAP50 implementation" + if metric_backend_impl.name == "simplified" + else "accuracy skipped before metric backend execution" + ), + } + metric_notes = [f"Accuracy metrics backend: {metric_backend_impl.name}."] + metric_warnings = [] + + notes = [ + "Detection evaluation uses image directory traversal.", + "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.", + "Accuracy uses YOLO txt labels or COCO annotations when provided.", + "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.", + *metric_notes, + ] + if metric_warnings: + notes.extend(f"Metric warning: {warning}" for warning in metric_warnings) return DetectionEvalResult( task="detection", engine=engine.name, device=engine.device, sample_count=len(image_files), - metrics={ - "map50": map50, - "map50_95": map50_95, - "f1_score": f1_score, - "precision": precision, - "recall": recall, - }, - notes=[ - "Detection evaluation uses image directory traversal.", - "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.", - "Accuracy uses YOLO txt labels or COCO annotations when provided.", - "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.", - ], + metrics=metrics, + notes=notes, model_input={ "name": model_input.name,
"dtype": str(model_input.dtype), @@ -979,6 +986,7 @@ def evaluate_detection_engine( "input_size": input_size, "rgb": use_rgb, "num_classes": num_classes, + "metric_backend": metric_backend_impl.name, }, extra={ "engine_path": engine_path, @@ -991,6 +999,8 @@ def evaluate_detection_engine( "accuracy_status": accuracy_status, "accuracy_skip_reason": accuracy_skip_reason, "structural_validation": structural_validation, + "metric_backend": metric_backend_impl.name, + "metric_warnings": metric_warnings, }, ) finally: diff --git a/inferedgelab/evaluation/__init__.py b/inferedgelab/evaluation/__init__.py new file mode 100644 index 0000000..52cde34 --- /dev/null +++ b/inferedgelab/evaluation/__init__.py @@ -0,0 +1,11 @@ +"""Metric backend helpers for InferEdge evaluation.""" + +from inferedgelab.evaluation.metrics import MetricBackendError +from inferedgelab.evaluation.metrics import get_metric_backend +from inferedgelab.evaluation.metrics import supported_metric_backends + +__all__ = [ + "MetricBackendError", + "get_metric_backend", + "supported_metric_backends", +] diff --git a/inferedgelab/evaluation/coco_eval.py b/inferedgelab/evaluation/coco_eval.py new file mode 100644 index 0000000..54e5c49 --- /dev/null +++ b/inferedgelab/evaluation/coco_eval.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +from typing import Any + + +def build_metric_payload( + *, + backend: str, + metrics: dict[str, Any], + note: str | None = None, + warnings: list[str] | None = None, +) -> dict[str, Any]: + payload = {"backend": backend, **metrics} + if note: + payload["note"] = note + if warnings: + payload["warnings"] = list(warnings) + return payload diff --git a/inferedgelab/evaluation/metrics.py b/inferedgelab/evaluation/metrics.py new file mode 100644 index 0000000..3b4125f --- /dev/null +++ b/inferedgelab/evaluation/metrics.py @@ -0,0 +1,260 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Callable, Protocol, Sequence + +import numpy as np + +from inferedgelab.evaluation.coco_eval import build_metric_payload +from inferedgelab.evaluation.pycocotools_backend import PycocotoolsUnavailableError +from inferedgelab.evaluation.pycocotools_backend import require_pycocotools + + +class MetricBackendError(RuntimeError): + """Raised when a metric backend cannot evaluate the requested payload.""" + + +class AveragePrecisionFn(Protocol): + def __call__( + self, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + *, + num_classes: int, + iou_threshold: float, + ) -> float: ... + + +class PrecisionRecallFn(Protocol): + def __call__( + self, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + *, + num_classes: int, + iou_threshold: float, + ) -> tuple[float, float, float]: ... + + +@dataclass(frozen=True) +class MetricBackendResult: + metrics: dict[str, Any] + notes: list[str] + warnings: list[str] + + +class MetricBackend(Protocol): + name: str + + def ensure_available(self) -> None: ... + + def evaluate( + self, + *, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + num_classes: int, + iou_threshold: float, + average_precision_fn: AveragePrecisionFn, + precision_recall_fn: PrecisionRecallFn, + mean_fn: Callable[[list[float]], float], + ) -> MetricBackendResult: ... 
+ + +class SimplifiedMap50Backend: + name = "simplified" + + def ensure_available(self) -> None: + return None + + def evaluate( + self, + *, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + num_classes: int, + iou_threshold: float, + average_precision_fn: AveragePrecisionFn, + precision_recall_fn: PrecisionRecallFn, + mean_fn: Callable[[list[float]], float], + ) -> MetricBackendResult: + precision, recall, f1_score = precision_recall_fn( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=iou_threshold, + ) + map50 = average_precision_fn( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=0.5, + ) + thresholds = [round(0.5 + 0.05 * index, 2) for index in range(10)] + map50_95 = float( + mean_fn( + [ + average_precision_fn( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=float(threshold), + ) + for threshold in thresholds + ] + ) + ) + return MetricBackendResult( + metrics=build_metric_payload( + backend=self.name, + metrics={ + "map50": float(map50), + "map50_95": map50_95, + "f1_score": float(f1_score), + "precision": float(precision), + "recall": float(recall), + }, + note="lightweight simplified mAP50 implementation", + ), + notes=["Accuracy metrics backend: simplified lightweight mAP50."], + warnings=[], + ) + + +class PycocotoolsBackend: + name = "pycocotools" + + def ensure_available(self) -> None: + try: + require_pycocotools() + except PycocotoolsUnavailableError as exc: + raise MetricBackendError(str(exc)) from exc + + def evaluate( + self, + *, + predictions_by_image: Sequence[Any], + ground_truths_by_image: Sequence[Any], + num_classes: int, + iou_threshold: float, + average_precision_fn: AveragePrecisionFn, + precision_recall_fn: PrecisionRecallFn, + mean_fn: Callable[[list[float]], float], + ) -> MetricBackendResult: + modules = require_pycocotools() + coco_cls = modules["COCO"] + cocoeval_cls = modules["COCOeval"] + + images = [{"id": index + 1} for index, _ in enumerate(ground_truths_by_image)] + categories = [{"id": class_id, "name": str(class_id)} for class_id in range(num_classes)] + annotations: list[dict[str, Any]] = [] + detections: list[dict[str, Any]] = [] + + annotation_id = 1 + for image_index, ground_truths in enumerate(ground_truths_by_image, start=1): + for ground_truth in ground_truths: + box = _xyxy_to_xywh(getattr(ground_truth, "box")) + annotations.append( + { + "id": annotation_id, + "image_id": image_index, + "category_id": int(getattr(ground_truth, "class_id")), + "bbox": box, + "area": box[2] * box[3], + "iscrowd": 0, + } + ) + annotation_id += 1 + + for image_index, predictions in enumerate(predictions_by_image, start=1): + for prediction in predictions: + detections.append( + { + "image_id": image_index, + "category_id": int(getattr(prediction, "class_id")), + "bbox": _xyxy_to_xywh(getattr(prediction, "box")), + "score": float(getattr(prediction, "confidence")), + } + ) + + if not annotations: + return MetricBackendResult( + metrics=build_metric_payload( + backend=self.name, + metrics={ + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0, + }, + warnings=["No COCO annotations were available for pycocotools evaluation."], + ), + notes=["Accuracy metrics backend: pycocotools."], + warnings=["No COCO annotations were available for pycocotools evaluation."], + ) + + coco_gt = coco_cls() + coco_gt.dataset = { + "images": images, + "annotations": annotations, + 
"categories": categories, + "info": {}, + "licenses": [], + } + coco_gt.createIndex() + + coco_dt = coco_gt.loadRes(detections) if detections else coco_gt.loadRes([]) + coco_eval = cocoeval_cls(coco_gt, coco_dt, "bbox") + coco_eval.params.catIds = [item["id"] for item in categories] + coco_eval.params.imgIds = [item["id"] for item in images] + coco_eval.evaluate() + coco_eval.accumulate() + + precision_values = coco_eval.eval["precision"] + recall_values = coco_eval.eval["recall"] + valid_precision = precision_values[precision_values > -1] + valid_recall = recall_values[recall_values > -1] + map50_95 = float(np.mean(valid_precision)) if valid_precision.size else 0.0 + map50_precision = precision_values[0] + valid_map50_precision = map50_precision[map50_precision > -1] + map50 = float(np.mean(valid_map50_precision)) if valid_map50_precision.size else 0.0 + recall = float(np.mean(valid_recall)) if valid_recall.size else 0.0 + precision = map50 + f1_score = 0.0 + if precision + recall > 0: + f1_score = 2.0 * precision * recall / (precision + recall) + + return MetricBackendResult( + metrics=build_metric_payload( + backend=self.name, + metrics={ + "map50": map50, + "map50_95": map50_95, + "f1_score": float(f1_score), + "precision": precision, + "recall": recall, + }, + ), + notes=["Accuracy metrics backend: pycocotools official COCO evaluator."], + warnings=[], + ) + + +def supported_metric_backends() -> tuple[str, ...]: + return ("simplified", "pycocotools") + + +def get_metric_backend(name: str) -> MetricBackend: + normalized = name.strip().lower() + if normalized == "simplified": + return SimplifiedMap50Backend() + if normalized == "pycocotools": + return PycocotoolsBackend() + supported = ", ".join(supported_metric_backends()) + raise MetricBackendError(f"unsupported metric backend: {name}. Supported backends: {supported}") + + +def _xyxy_to_xywh(box: Sequence[float]) -> list[float]: + x1, y1, x2, y2 = [float(value) for value in box] + return [x1, y1, max(0.0, x2 - x1), max(0.0, y2 - y1)] diff --git a/inferedgelab/evaluation/pycocotools_backend.py b/inferedgelab/evaluation/pycocotools_backend.py new file mode 100644 index 0000000..db18fb7 --- /dev/null +++ b/inferedgelab/evaluation/pycocotools_backend.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import importlib +from typing import Any + + +class PycocotoolsUnavailableError(RuntimeError): + """Raised when the optional pycocotools backend is requested but unavailable.""" + + +def require_pycocotools() -> dict[str, Any]: + try: + coco_module = importlib.import_module("pycocotools.coco") + cocoeval_module = importlib.import_module("pycocotools.cocoeval") + except ImportError as exc: + raise PycocotoolsUnavailableError( + "pycocotools backend requested but pycocotools is not installed. 
" + "Hint: pip install pycocotools" + ) from exc + + return { + "COCO": getattr(coco_module, "COCO"), + "COCOeval": getattr(cocoeval_module, "COCOeval"), + } diff --git a/tests/test_evaluate_detection.py b/tests/test_evaluate_detection.py index 9adc24e..52a4e83 100644 --- a/tests/test_evaluate_detection.py +++ b/tests/test_evaluate_detection.py @@ -169,11 +169,13 @@ def test_accuracy_payload_save_keeps_task_and_metrics_structure(tmp_path): device="gpu", sample_count=2, metrics={ + "backend": "simplified", "map50": 0.9, "map50_95": 0.7, "f1_score": 0.8, "precision": 0.85, "recall": 0.75, + "note": "lightweight simplified mAP50 implementation", }, notes=[], model_input={"name": "images", "dtype": "float32", "shape": [1, 3, 640, 640]}, @@ -195,6 +197,7 @@ def test_accuracy_payload_save_keeps_task_and_metrics_structure(tmp_path): saved = json.loads(out_json.read_text(encoding="utf-8")) assert saved["task"] == "detection" + assert saved["metrics"]["backend"] == "simplified" assert saved["metrics"]["map50"] == pytest.approx(0.9) assert saved["metrics"]["f1_score"] == pytest.approx(0.8) assert saved["dataset"]["sample_count"] == 2 @@ -213,11 +216,13 @@ def fake_evaluate_detection_engine(**kwargs): device="gpu", sample_count=3, metrics={ + "backend": "simplified", "map50": 0.7791, "map50_95": 0.5512, "f1_score": 0.8180, "precision": 0.7950, "recall": 0.8424, + "note": "lightweight simplified mAP50 implementation", }, notes=[], model_input={"name": "images", "dtype": "float16", "shape": [1, 3, 640, 640]}, @@ -266,11 +271,14 @@ def fake_save_result(result, out_dir="results"): payload = json.loads(out_json.read_text(encoding="utf-8")) assert payload["task"] == "detection" + assert payload["metrics"]["backend"] == "simplified" assert payload["metrics"]["map50"] == pytest.approx(0.7791) assert captured["result"].accuracy["task"] == "detection" assert captured["result"].accuracy["metrics"]["map50"] == pytest.approx(0.7791) assert captured["result"].run_config["mode"] == "evaluate-detection" + assert captured["result"].run_config["metric_backend"] == "simplified" assert captured["engine_kwargs"]["debug_samples"] == 0 + assert captured["engine_kwargs"]["metric_backend"] == "simplified" def test_evaluate_detection_command_writes_contract_evaluation_report(tmp_path, monkeypatch): @@ -286,11 +294,13 @@ def fake_evaluate_detection_engine(**kwargs): device="cpu", sample_count=1, metrics={ + "backend": "simplified", "map50": 0.0, "map50_95": 0.0, "f1_score": 0.0, "precision": 0.0, "recall": 0.0, + "note": "lightweight simplified mAP50 implementation", }, notes=["structural validation only"], model_input={"name": "images", "dtype": "float32", "shape": [1, 3, 640, 640]}, @@ -341,8 +351,10 @@ def fake_evaluate_detection_engine(**kwargs): report = json.loads(report_json.read_text(encoding="utf-8")) assert captured["engine_kwargs"]["label_dir"] is None assert captured["engine_kwargs"]["coco_annotations"] is None + assert captured["engine_kwargs"]["metric_backend"] == "simplified" assert report["model_contract"]["preset"] == "yolov8_coco" assert report["accuracy"]["status"] == "skipped" + assert report["accuracy"]["metrics"]["backend"] == "simplified" assert "accuracy skipped reason" in report_md.read_text(encoding="utf-8") @@ -441,6 +453,36 @@ def test_evaluate_detection_help_shows_debug_samples_option(): assert "--model-contract" in result.stdout assert "--preset" in result.stdout assert "--coco-annotations" in result.stdout + assert "--metric-backend" in result.stdout + + +def 
test_evaluate_detection_command_rejects_unsupported_metric_backend(): + from inferedgelab.commands import evaluate_detection + + with pytest.raises(Exception, match="unsupported metric backend"): + evaluate_detection.evaluate_detection_cmd( + model_path="models/onnx/yolov8n.onnx", + engine="onnxruntime", + engine_path="", + image_dir="images", + label_dir="labels", + metric_backend="made_up_backend", + preset="yolov8_coco", + model_contract="", + num_classes=1, + precision="fp32", + conf_threshold=0.2, + nms_threshold=0.45, + iou_threshold=0.5, + rgb=True, + debug_samples=0, + out_json="", + report_json="", + report_md="", + report_html="", + out_dir="results", + save_structured_result=False, + ) def test_cli_help_registers_evaluate_detection_command(): diff --git a/tests/test_metric_backends.py b/tests/test_metric_backends.py new file mode 100644 index 0000000..7b11a3e --- /dev/null +++ b/tests/test_metric_backends.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import importlib + +import pytest + +from inferedgelab.evaluation.metrics import MetricBackendError +from inferedgelab.evaluation.metrics import get_metric_backend +from inferedgelab.evaluation.metrics import supported_metric_backends +from inferedgelab.evaluation.pycocotools_backend import require_pycocotools + + +def test_simplified_metric_backend_records_backend_and_note(): + backend = get_metric_backend("simplified") + + result = backend.evaluate( + predictions_by_image=[], + ground_truths_by_image=[], + num_classes=1, + iou_threshold=0.5, + average_precision_fn=lambda *args, **kwargs: 0.68, + precision_recall_fn=lambda *args, **kwargs: (0.7, 0.6, 0.646), + mean_fn=lambda values: sum(values) / len(values), + ) + + assert result.metrics["backend"] == "simplified" + assert result.metrics["map50"] == pytest.approx(0.68) + assert result.metrics["precision"] == pytest.approx(0.7) + assert result.metrics["recall"] == pytest.approx(0.6) + assert result.metrics["note"] == "lightweight simplified mAP50 implementation" + + +def test_unsupported_metric_backend_fails_clearly(): + with pytest.raises(MetricBackendError, match="unsupported metric backend"): + get_metric_backend("made_up_backend") + + +def test_pycocotools_backend_requested_without_dependency_fails_clearly(monkeypatch): + real_import_module = importlib.import_module + + def fake_import_module(name: str): + # Delegate to the saved original: calling importlib.import_module here would re-enter this fake once it is monkeypatched in below. + if name.startswith("pycocotools"): + raise ImportError("missing pycocotools") + return real_import_module(name) + + monkeypatch.setattr(importlib, "import_module", fake_import_module) + + with pytest.raises(RuntimeError, match="pycocotools backend requested but pycocotools is not installed"): + require_pycocotools() + + +def test_supported_metric_backends_include_simplified_and_pycocotools(): + assert supported_metric_backends() == ("simplified", "pycocotools") diff --git a/tests/test_validation_demo_report.py b/tests/test_validation_demo_report.py index 4591f2d..ed39aa0 100644 --- a/tests/test_validation_demo_report.py +++ b/tests/test_validation_demo_report.py @@ -17,6 +17,8 @@ def test_yolov8_coco_subset_demo_report_contains_evaluated_accuracy(): assert report["preset"]["name"] == "yolov8_coco" assert report["runtime_result"]["sample_count"] == 10 assert report["accuracy"]["status"] == "evaluated" + assert report["accuracy"]["metrics"]["backend"] == "simplified" + assert report["accuracy"]["metrics"]["note"] == "lightweight simplified mAP50 implementation" assert round(report["accuracy"]["metrics"]["map50"], 4) == 0.141 assert round(report["accuracy"]["metrics"]["precision"], 4) == 0.2941 assert
round(report["accuracy"]["metrics"]["recall"], 4) == 0.1685
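For completeness, a self-contained sketch of driving a metric backend directly, mirroring how the tests above inject stub scoring callables; the `Detection` dataclass is a hypothetical stand-in for the evaluator's records (the pycocotools backend reads only `box`, `class_id`, and `confidence` via getattr, while the simplified backend passes the records through untouched to the injected callables):

```python
from dataclasses import dataclass

from inferedgelab.evaluation.metrics import get_metric_backend


@dataclass
class Detection:
    # Hypothetical record type standing in for the evaluator's detections.
    box: tuple[float, float, float, float]  # xyxy pixel coordinates
    class_id: int
    confidence: float = 1.0


ground_truths = [[Detection((10.0, 10.0, 50.0, 50.0), 0)]]
predictions = [[Detection((12.0, 11.0, 49.0, 52.0), 0, 0.9)]]

backend = get_metric_backend("simplified")
result = backend.evaluate(
    predictions_by_image=predictions,
    ground_truths_by_image=ground_truths,
    num_classes=1,
    iou_threshold=0.5,
    # Stub scoring callables, as in tests/test_metric_backends.py; the real
    # evaluator passes compute_average_precision / compute_precision_recall_f1.
    average_precision_fn=lambda p, g, *, num_classes, iou_threshold: 1.0,
    precision_recall_fn=lambda p, g, *, num_classes, iou_threshold: (1.0, 1.0, 1.0),
    mean_fn=lambda values: sum(values) / len(values),
)
print(result.metrics["backend"], result.metrics["map50"])  # simplified 1.0
```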