1 change: 1 addition & 0 deletions README.md
@@ -197,6 +197,7 @@ CLI / API → Service Layer → Structured Result → Compare / Report

InferEdgeLab treats model evaluation as a **contract/preset-based validation workflow**, not as a claim that any arbitrary model can be automatically scored without context.
`evaluate-detection` now supports the `yolov8_coco` preset, optional `model_contract.json`, COCO annotations, YOLO txt labels, structural detection-output validation, and JSON/Markdown/HTML evaluation reports.
Metric evaluation defaults to the lightweight `--metric-backend simplified` path; `--metric-backend pycocotools` can be selected explicitly when the optional `pycocotools` package is installed.
When annotations are not provided, accuracy is explicitly marked as `skipped` and the report records structural validation only.

Planned presets such as `resnet_imagenet` and `custom_contract` keep future evaluation work scoped to explicit model contracts and dataset assumptions.
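
Because the reports record which backend produced the metrics, downstream tooling can branch on that field rather than guessing. A minimal sketch, assuming a hypothetical report path and the `--report-json` flag spelling (the `accuracy` schema matches the JSON report excerpt later in this diff):

```python
import json
from pathlib import Path

# Hypothetical path; evaluate-detection would write this via a report flag.
report = json.loads(Path("reports/yolov8_coco_subset.json").read_text())

accuracy = report["accuracy"]
if accuracy["status"] == "skipped":
    # No annotations were provided; the run is structural validation only.
    print("accuracy skipped:", accuracy.get("reason"))
else:
    metrics = accuracy["metrics"]
    print("backend:", metrics["backend"])   # "simplified" or "pycocotools"
    print("mAP@50 :", metrics["map50"])
```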
2 changes: 1 addition & 1 deletion docs/portfolio/final_validation_completion.md
@@ -70,7 +70,7 @@ These are intentionally outside the current completion boundary:
- file upload product flow
- production frontend deployment
- authentication, billing, and multi-user controls
-- full COCO official evaluation
+- making the optional official COCO evaluation (`pycocotools`) a required dependency
- more presets such as `resnet_imagenet`

## Portfolio Message
2 changes: 2 additions & 0 deletions docs/portfolio/yolov8_coco_subset_evaluation.md
@@ -20,6 +20,7 @@ It is not a full COCO benchmark and should not be presented as production model
| Samples | 10 |
| Ground-truth boxes | 89 |
| Post-NMS detections checked | 51 |
| Metric backend | simplified |
| mAP@50 | 0.1410 |
| mAP@50-95 | 0.0873 |
| Precision | 0.2941 |
@@ -31,6 +32,7 @@ It is not a full COCO benchmark and should not be presented as production model
## Interpretation

This demo proves that InferEdgeLab can load COCO-style annotations, run the YOLOv8 detection evaluator, compute simplified accuracy metrics, validate detection output structure, and emit JSON/Markdown/HTML reports.
The report records `metrics.backend = simplified`; `pycocotools` remains an optional explicit backend rather than a required dependency.
The numbers are intentionally documented as a small subset result only.
They are useful as portfolio workflow evidence, not as a claim of full COCO accuracy.
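
The simplified backend's source is not part of this diff. As an illustrative single-class sketch of what a lightweight mAP@50 computation typically looks like (greedy IoU matching, a monotone precision envelope, trapezoidal integration), with all names hypothetical:

```python
import numpy as np

def box_iou(a, b):
    # Boxes as [x1, y1, x2, y2]; an assumed convention for this sketch.
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    union = ((a[2] - a[0]) * (a[3] - a[1])
             + (b[2] - b[0]) * (b[3] - b[1]) - inter)
    return inter / union if union > 0 else 0.0

def simplified_ap50(preds_by_image, gts_by_image, iou_thr=0.5):
    # preds_by_image: {image_id: [(box, score), ...]} for a single class
    # gts_by_image:   {image_id: [box, ...]}
    scored = []  # (confidence, matched_a_ground_truth)
    total_gt = sum(len(g) for g in gts_by_image.values())
    for image_id, preds in preds_by_image.items():
        gts = gts_by_image.get(image_id, [])
        matched = set()
        for box, score in sorted(preds, key=lambda p: -p[1]):
            best, best_iou = None, iou_thr
            for gi, gt in enumerate(gts):
                if gi in matched:
                    continue
                v = box_iou(box, gt)
                if v >= best_iou:
                    best, best_iou = gi, v
            if best is not None:
                matched.add(best)
            scored.append((score, best is not None))
    if not scored or total_gt == 0:
        return 0.0
    scored.sort(key=lambda s: -s[0])
    tp = np.cumsum([hit for _, hit in scored])
    fp = np.cumsum([not hit for _, hit in scored])
    recall = tp / total_gt
    precision = tp / (tp + fp)
    # Enforce a monotonically decreasing precision envelope, then integrate.
    for i in range(len(precision) - 2, -1, -1):
        precision[i] = max(precision[i], precision[i + 1])
    return float(np.trapz(precision, recall))
```

mAP@50-95 then averages this quantity over IoU thresholds from 0.5 to 0.95 in steps of 0.05, which is exactly what the `map_thresholds` loop in `detection_evaluator.py` below did before this PR moved it behind the backend interface.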

@@ -11,15 +11,18 @@
- deployment signal: `review`

## Metrics
- backend: `simplified`
- map50: `0.14097840361885305`
- map50_95: `0.08728567780534073`
- f1_score: `0.21428571428571427`
- precision: `0.29411764705882354`
- recall: `0.16853932584269662`
- note: `lightweight simplified mAP50 implementation`

## Notes
- Detection evaluation uses image directory traversal.
- YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.
- Accuracy uses YOLO txt labels or COCO annotations when provided.
- When annotations are missing, InferEdge records accuracy_skipped and structural validation only.
- Accuracy metrics backend: simplified lightweight mAP50.
</pre></body></html>
@@ -260,11 +260,13 @@
   "accuracy": {
     "status": "evaluated",
     "metrics": {
+      "backend": "simplified",
       "map50": 0.14097840361885305,
       "map50_95": 0.08728567780534073,
       "f1_score": 0.21428571428571427,
       "precision": 0.29411764705882354,
-      "recall": 0.16853932584269662
+      "recall": 0.16853932584269662,
+      "note": "lightweight simplified mAP50 implementation"
     },
     "reason": null
   },
@@ -307,6 +309,7 @@
     "Detection evaluation uses image directory traversal.",
     "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.",
     "Accuracy uses YOLO txt labels or COCO annotations when provided.",
-    "When annotations are missing, InferEdge records accuracy_skipped and structural validation only."
+    "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.",
+    "Accuracy metrics backend: simplified lightweight mAP50."
   ]
 }
@@ -10,14 +10,17 @@
- deployment signal: `review`

## Metrics
- backend: `simplified`
- map50: `0.14097840361885305`
- map50_95: `0.08728567780534073`
- f1_score: `0.21428571428571427`
- precision: `0.29411764705882354`
- recall: `0.16853932584269662`
- note: `lightweight simplified mAP50 implementation`

## Notes
- Detection evaluation uses image directory traversal.
- YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.
- Accuracy uses YOLO txt labels or COCO annotations when provided.
- When annotations are missing, InferEdge records accuracy_skipped and structural validation only.
- Accuracy metrics backend: simplified lightweight mAP50.
17 changes: 17 additions & 0 deletions inferedgelab/commands/evaluate_detection.py
@@ -16,6 +16,9 @@
supported_engines,
supported_engines_display,
)
from inferedgelab.evaluation.metrics import MetricBackendError
from inferedgelab.evaluation.metrics import get_metric_backend
from inferedgelab.evaluation.metrics import supported_metric_backends
from inferedgelab.result.saver import save_result
from inferedgelab.result.schema import BenchmarkResult
from inferedgelab.utils.system_info import collect_system_snapshot
@@ -48,6 +51,11 @@ def evaluate_detection_cmd(
label_dir: str = typer.Option("", "--label-dir", help="YOLO txt 라벨 디렉토리"),
coco_annotations: str = typer.Option("", "--coco-annotations", help="COCO annotation JSON 경로"),
preset: str = typer.Option("yolov8_coco", "--preset", help="Validation preset 이름"),
metric_backend: str = typer.Option(
"simplified",
"--metric-backend",
help="Metric backend: simplified or pycocotools",
),
model_contract: str = typer.Option("", "--model-contract", help="model_contract.json 경로"),
num_classes: int = typer.Option(1, "--num-classes", help="클래스 수"),
precision: str = typer.Option("fp16", "--precision", help="precision 메타데이터 (fp32, fp16, int8)"),
@@ -86,11 +94,17 @@
raise typer.BadParameter("--num-classes must be >= 1")
coco_annotations = _option_string(coco_annotations)
preset = _option_string(preset, "yolov8_coco")
metric_backend = _option_string(metric_backend, "simplified").strip().lower()
model_contract = _option_string(model_contract)
report_json = _option_string(report_json)
report_md = _option_string(report_md)
report_html = _option_string(report_html)
preset = preset.strip().lower()
try:
get_metric_backend(metric_backend).ensure_available()
except MetricBackendError as exc:
supported = ", ".join(supported_metric_backends())
raise typer.BadParameter(f"{exc} Supported metric backends: {supported}") from exc
try:
preset_def = get_preset(preset)
contract = (
@@ -121,6 +135,7 @@
use_rgb=rgb,
input_size=640,
debug_samples=debug_samples,
metric_backend=metric_backend,
)
except RuntimeError as exc:
_exit_with_runtime_error(str(exc))
@@ -156,6 +171,7 @@
"model_contract_path": model_contract.strip() or None,
"coco_annotations": coco_annotations.strip() or None,
"num_classes": num_classes,
"metric_backend": metric_backend,
},
accuracy=accuracy_payload,
extra={
@@ -190,6 +206,7 @@
rprint(f"Images : {image_dir}")
rprint(f"Labels : {label_dir or '(not provided)'}")
rprint(f"COCO annotations: {coco_annotations or '(not provided)'}")
rprint(f"Metric backend : {eval_result.metrics.get('backend', metric_backend)}")
rprint(f"Samples : {eval_result.sample_count}")
rprint(f"Accuracy status : {eval_result.extra.get('accuracy_status', 'evaluated')}")
if eval_result.extra.get("accuracy_status") == "skipped":
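
Taken together, the hunks above mean backend validation happens before any engine work: an invalid `--metric-backend` value fails fast with `BadParameter`. A hypothetical end-to-end invocation as a sketch; the console-script name, `--image-dir`, and `--report-json` spellings are assumptions, while `--metric-backend`, `--preset`, `--coco-annotations`, and `--num-classes` appear verbatim in the declarations above:

```python
import subprocess

# Hypothetical invocation of the evaluate-detection command.
cmd = [
    "inferedgelab", "evaluate-detection",
    "--image-dir", "datasets/coco_subset/images",
    "--coco-annotations", "datasets/coco_subset/annotations.json",
    "--preset", "yolov8_coco",
    "--metric-backend", "pycocotools",
    "--num-classes", "80",
    "--report-json", "reports/yolov8_coco_subset.json",
]
result = subprocess.run(cmd, capture_output=True, text=True)
# If pycocotools is not installed, ensure_available() raises and the command
# exits with a BadParameter error listing the supported backends.
print(result.returncode)
print(result.stderr.strip().splitlines()[-1] if result.stderr else "")
```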
90 changes: 50 additions & 40 deletions inferedgelab/core/detection_evaluator.py
@@ -10,6 +10,8 @@

from inferedgelab.engines.base import EngineModelIO
from inferedgelab.engines.registry import create_engine, normalize_engine_name
from inferedgelab.evaluation.metrics import MetricBackendError
from inferedgelab.evaluation.metrics import get_metric_backend
from inferedgelab.validation.coco import load_coco_ground_truths
from inferedgelab.validation.structural import validate_detection_structure

@@ -33,7 +35,7 @@ class DetectionEvalResult:
     engine: str
     device: str
     sample_count: int
-    metrics: Dict[str, float]
+    metrics: Dict[str, Any]
     notes: List[str]
     model_input: Dict[str, Any]
     actual_input_shape: List[int]
@@ -810,8 +812,11 @@ def evaluate_detection_engine(
use_rgb: bool = True,
input_size: int = 640,
debug_samples: int = 0,
metric_backend: str = "simplified",
) -> DetectionEvalResult:
engine_name = normalize_engine_name(engine_name)
metric_backend_impl = get_metric_backend(metric_backend)
metric_backend_impl.ensure_available()
engine = create_engine(engine_name)

load_kwargs: dict[str, Any] = {}
@@ -912,53 +917,55 @@
)

     if accuracy_status == "evaluated":
-        precision, recall, f1_score = compute_precision_recall_f1(
-            predictions_by_image,
-            ground_truths_by_image,
-            num_classes=num_classes,
-            iou_threshold=iou_threshold,
-        )
-        map50 = compute_average_precision(
-            predictions_by_image,
-            ground_truths_by_image,
-            num_classes=num_classes,
-            iou_threshold=0.5,
-        )
-        map_thresholds = np.arange(0.5, 1.0, 0.05)
-        map50_95 = float(
-            np.mean(
-                [
-                    compute_average_precision(
-                        predictions_by_image,
-                        ground_truths_by_image,
-                        num_classes=num_classes,
-                        iou_threshold=float(threshold),
-                    )
-                    for threshold in map_thresholds
-                ]
-            )
-        )
+        try:
+            backend_result = metric_backend_impl.evaluate(
+                predictions_by_image=predictions_by_image,
+                ground_truths_by_image=ground_truths_by_image,
+                num_classes=num_classes,
+                iou_threshold=iou_threshold,
+                average_precision_fn=compute_average_precision,
+                precision_recall_fn=compute_precision_recall_f1,
+                mean_fn=lambda values: float(np.mean(values)),
+            )
+        except MetricBackendError:
+            raise
+        metrics = backend_result.metrics
+        metric_notes = backend_result.notes
+        metric_warnings = backend_result.warnings
     else:
-        precision = recall = f1_score = map50 = map50_95 = 0.0
+        metrics = {
+            "backend": metric_backend_impl.name,
+            "map50": 0.0,
+            "map50_95": 0.0,
+            "f1_score": 0.0,
+            "precision": 0.0,
+            "recall": 0.0,
+            "note": (
+                "lightweight simplified mAP50 implementation"
+                if metric_backend_impl.name == "simplified"
+                else "accuracy skipped before metric backend execution"
+            ),
+        }
+        metric_notes = [f"Accuracy metrics backend: {metric_backend_impl.name}."]
+        metric_warnings = []
 
+    notes = [
+        "Detection evaluation uses image directory traversal.",
+        "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.",
+        "Accuracy uses YOLO txt labels or COCO annotations when provided.",
+        "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.",
+        *metric_notes,
+    ]
+    if metric_warnings:
+        notes.extend(f"Metric warning: {warning}" for warning in metric_warnings)
 
     return DetectionEvalResult(
         task="detection",
         engine=engine.name,
         device=engine.device,
         sample_count=len(image_files),
-        metrics={
-            "map50": map50,
-            "map50_95": map50_95,
-            "f1_score": f1_score,
-            "precision": precision,
-            "recall": recall,
-        },
-        notes=[
-            "Detection evaluation uses image directory traversal.",
-            "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.",
-            "Accuracy uses YOLO txt labels or COCO annotations when provided.",
-            "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.",
-        ],
+        metrics=metrics,
+        notes=notes,
model_input={
"name": model_input.name,
"dtype": str(model_input.dtype),
@@ -979,6 +986,7 @@
"input_size": input_size,
"rgb": use_rgb,
"num_classes": num_classes,
"metric_backend": metric_backend_impl.name,
},
extra={
"engine_path": engine_path,
Expand All @@ -991,6 +999,8 @@ def evaluate_detection_engine(
"accuracy_status": accuracy_status,
"accuracy_skip_reason": accuracy_skip_reason,
"structural_validation": structural_validation,
"metric_backend": metric_backend_impl.name,
"metric_warnings": metric_warnings,
},
)
finally:
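
`inferedgelab/evaluation/metrics.py` itself is not included in this diff, so the backend interface can only be inferred from the call sites above (`get_metric_backend`, `ensure_available`, `evaluate`, and the `metrics`/`notes`/`warnings` attributes on its result). A reconstruction sketch under those assumptions, not the actual module:

```python
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List


class MetricBackendError(RuntimeError):
    """Raised when a backend is unknown or unavailable (assumed base class)."""


@dataclass
class MetricBackendResult:
    # Shape inferred from backend_result.metrics / .notes / .warnings above.
    metrics: Dict[str, Any]
    notes: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)


class SimplifiedBackend:
    """Sketch of the simplified backend, mirroring the pre-PR inline logic."""

    name = "simplified"

    def ensure_available(self) -> None:
        return None  # the simplified path has no optional dependency

    def evaluate(
        self,
        *,
        predictions_by_image: Dict[str, Any],
        ground_truths_by_image: Dict[str, Any],
        num_classes: int,
        iou_threshold: float,
        average_precision_fn: Callable[..., float],
        precision_recall_fn: Callable[..., tuple],
        mean_fn: Callable[[List[float]], float],
    ) -> MetricBackendResult:
        precision, recall, f1_score = precision_recall_fn(
            predictions_by_image,
            ground_truths_by_image,
            num_classes=num_classes,
            iou_threshold=iou_threshold,
        )
        map50 = average_precision_fn(
            predictions_by_image,
            ground_truths_by_image,
            num_classes=num_classes,
            iou_threshold=0.5,
        )
        thresholds = [0.5 + 0.05 * i for i in range(10)]  # 0.50 .. 0.95
        map50_95 = mean_fn([
            average_precision_fn(
                predictions_by_image,
                ground_truths_by_image,
                num_classes=num_classes,
                iou_threshold=t,
            )
            for t in thresholds
        ])
        return MetricBackendResult(
            metrics={
                "backend": self.name,
                "map50": map50,
                "map50_95": map50_95,
                "f1_score": f1_score,
                "precision": precision,
                "recall": recall,
                "note": "lightweight simplified mAP50 implementation",
            },
            notes=[f"Accuracy metrics backend: {self.name}."],
        )
```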
11 changes: 11 additions & 0 deletions inferedgelab/evaluation/__init__.py
@@ -0,0 +1,11 @@
"""Metric backend helpers for InferEdge evaluation."""

from inferedgelab.evaluation.metrics import MetricBackendError
from inferedgelab.evaluation.metrics import get_metric_backend
from inferedgelab.evaluation.metrics import supported_metric_backends

__all__ = [
    "MetricBackendError",
    "get_metric_backend",
    "supported_metric_backends",
]
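
A minimal sanity check of this package surface, assuming `supported_metric_backends()` returns the backend names documented elsewhere in this PR:

```python
from inferedgelab.evaluation import get_metric_backend, supported_metric_backends

print(supported_metric_backends())  # expected to include "simplified" and "pycocotools"
backend = get_metric_backend("simplified")
backend.ensure_available()  # the simplified path needs no optional dependency
print(backend.name)
```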
18 changes: 18 additions & 0 deletions inferedgelab/evaluation/coco_eval.py
@@ -0,0 +1,18 @@
from __future__ import annotations

from typing import Any


def build_metric_payload(
    *,
    backend: str,
    metrics: dict[str, Any],
    note: str | None = None,
    warnings: list[str] | None = None,
) -> dict[str, Any]:
    payload = {"backend": backend, **metrics}
    if note:
        payload["note"] = note
    if warnings:
        payload["warnings"] = list(warnings)
    return payload
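
Since `build_metric_payload` is shown in full above, its behavior can be illustrated directly; the metric values here are made up for the example:

```python
payload = build_metric_payload(
    backend="pycocotools",
    metrics={"map50": 0.42, "map50_95": 0.27},
    note="official COCO evaluation",
    warnings=["only 10 images evaluated"],
)
print(payload)
# {'backend': 'pycocotools', 'map50': 0.42, 'map50_95': 0.27,
#  'note': 'official COCO evaluation', 'warnings': ['only 10 images evaluated']}
```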