11 changes: 11 additions & 0 deletions README.md
@@ -188,6 +188,17 @@ CLI / API → Service Layer → Structured Result → Compare / Report

---

## Contract-Based Validation

InferEdgeLab treats model evaluation as a **contract/preset-based validation workflow**, rather than claiming that any arbitrary model can be scored automatically without context.
`evaluate-detection` now supports the `yolov8_coco` preset, an optional `model_contract.json`, COCO annotations, YOLO txt labels, structural validation of detection outputs, and JSON/Markdown/HTML evaluation reports.
When no annotations are provided, accuracy is explicitly marked as `skipped` and the report records structural validation only.
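
The same preset/contract resolution that `evaluate-detection` performs can also be driven programmatically. A minimal sketch using the validation helpers behind the command (the fixture path is one of the demo files under `examples/validation_demo/`):

```python
from inferedgelab.validation.model_contract import (
    ModelContractError,
    build_default_contract,
    load_model_contract,
)
from inferedgelab.validation.presets import get_preset, supported_presets

preset_name = "yolov8_coco"
preset_def = get_preset(preset_name)

try:
    # An explicit contract file takes precedence over the preset default.
    contract = load_model_contract(
        "examples/validation_demo/yolov8_coco_model_contract.json",
        default_preset=preset_name,
    )
except (ValueError, ModelContractError) as exc:
    # Same error surface the CLI maps to a typer.BadParameter message.
    raise SystemExit(f"{exc} Supported presets: {', '.join(supported_presets())}")
```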

Planned presets such as `resnet_imagenet` and `custom_contract` keep future evaluation work scoped to explicit model contracts and dataset assumptions.
Small contract fixtures for the normal and problem demo cases live under `examples/validation_demo/`.

---

## Key Results (Real Hardware Validation)

InferEdgeLab was validated on real edge hardware using YOLOv8 models.
26 changes: 26 additions & 0 deletions examples/validation_demo/coco_minimal_annotations.json
@@ -0,0 +1,26 @@
{
"images": [
{
"id": 1,
"file_name": "sample.jpg",
"width": 640,
"height": 480
}
],
"categories": [
{
"id": 1,
"name": "person"
}
],
"annotations": [
{
"id": 1,
"image_id": 1,
"category_id": 1,
"bbox": [100.0, 120.0, 80.0, 60.0],
"area": 4800.0,
"iscrowd": 0
}
]
}
24 changes: 24 additions & 0 deletions examples/validation_demo/problem_model_contract.json
@@ -0,0 +1,24 @@
{
"contract_version": "1",
"task": "object_detection",
"preset": "yolov8_coco",
"input": {
"name": "images",
"shape": [1, 3, 320, 320],
"format": "NCHW_RGB_FLOAT32_0_1"
},
"output": {
"name": "output0",
"type": "yolov8_detection",
"shape": [1, 84, 8400],
"format": "tensor"
},
"thresholds": {
"score": 0.25,
"iou": 0.5
},
"metadata": {
"demo_case": "problem",
"expected_issue": "Input shape intentionally differs from the YOLOv8 COCO preset input size."
}
}
26 changes: 26 additions & 0 deletions examples/validation_demo/yolov8_coco_model_contract.json
@@ -0,0 +1,26 @@
{
"contract_version": "1",
"task": "object_detection",
"preset": "yolov8_coco",
"input": {
"name": "images",
"shape": [1, 3, 640, 640],
"format": "NCHW_RGB_FLOAT32_0_1",
"dtype": "float32"
},
"output": {
"name": "output0",
"type": "yolov8_detection",
"shape": [1, 84, 8400],
"format": "tensor",
"dtype": "float32"
},
"thresholds": {
"score": 0.25,
"iou": 0.5
},
"metadata": {
"demo_case": "normal",
"note": "Small contract fixture for contract/preset validation demos."
}
}
82 changes: 74 additions & 8 deletions inferedgelab/commands/evaluate_detection.py
@@ -19,19 +19,36 @@
from inferedgelab.result.saver import save_result
from inferedgelab.result.schema import BenchmarkResult
from inferedgelab.utils.system_info import collect_system_snapshot
from inferedgelab.validation.model_contract import (
ModelContractError,
build_default_contract,
load_model_contract,
)
from inferedgelab.validation.presets import get_preset, supported_presets
from inferedgelab.validation.report import build_evaluation_report, save_evaluation_report


def _exit_with_runtime_error(message: str) -> None:
rprint(f"[red]{message}[/red]")
raise typer.Exit(code=1)


def _option_string(value: object, default: str = "") -> str:
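# When the command function is invoked directly (not via the Typer CLI), option
# parameters may arrive as typer.OptionInfo objects rather than plain strings;
# fall back to their declared default in that case.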
if isinstance(value, str):
return value
option_default = getattr(value, "default", default)
return option_default if isinstance(option_default, str) else default


def evaluate_detection_cmd(
model_path: str = typer.Argument(..., help="Path to the ONNX model to evaluate"),
engine: str = typer.Option("tensorrt", "--engine", help="Inference engine to use"),
engine_path: str = typer.Option("", "--engine-path", help="Path to the runtime artifact"),
image_dir: str = typer.Option(..., "--image-dir", help="Directory of evaluation images"),
label_dir: str = typer.Option(..., "--label-dir", help="Directory of YOLO txt labels"),
label_dir: str = typer.Option("", "--label-dir", help="Directory of YOLO txt labels"),
coco_annotations: str = typer.Option("", "--coco-annotations", help="Path to a COCO annotation JSON"),
preset: str = typer.Option("yolov8_coco", "--preset", help="Validation preset name"),
model_contract: str = typer.Option("", "--model-contract", help="Path to model_contract.json"),
num_classes: int = typer.Option(1, "--num-classes", help="Number of classes"),
precision: str = typer.Option("fp16", "--precision", help="Precision metadata (fp32, fp16, int8)"),
conf_threshold: float = typer.Option(0.2, "--conf-threshold", help="confidence threshold"),
@@ -40,6 +57,9 @@ def evaluate_detection_cmd(
rgb: bool = typer.Option(True, "--rgb/--bgr", help="Use RGB input conversion after OpenCV read"),
debug_samples: int = typer.Option(0, "--debug-samples", help="Print internal debug output for the first N images"),
out_json: str = typer.Option("", "--out-json", help="Path to save the accuracy payload"),
report_json: str = typer.Option("", "--report-json", help="Path to save the evaluation report JSON"),
report_md: str = typer.Option("", "--report-md", help="Path to save the evaluation report Markdown"),
report_html: str = typer.Option("", "--report-html", help="Path to save the evaluation report HTML"),
out_dir: str = typer.Option("results", "--out-dir", help="Directory for saving structured results"),
save_structured_result: bool = typer.Option(
True,
@@ -64,6 +84,23 @@ def evaluate_detection_cmd(

if num_classes <= 0:
raise typer.BadParameter("--num-classes must be >= 1")
coco_annotations = _option_string(coco_annotations)
preset = _option_string(preset, "yolov8_coco")
model_contract = _option_string(model_contract)
report_json = _option_string(report_json)
report_md = _option_string(report_md)
report_html = _option_string(report_html)
preset = preset.strip().lower()
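# Resolve the preset definition and the model contract; an explicit
# --model-contract file takes precedence over the preset's default contract.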
try:
preset_def = get_preset(preset)
contract = (
load_model_contract(model_contract.strip(), default_preset=preset)
if model_contract.strip()
else build_default_contract(preset)
)
except (ValueError, ModelContractError) as exc:
supported = ", ".join(supported_presets())
raise typer.BadParameter(f"{exc} Supported presets: {supported}") from exc
if not isinstance(debug_samples, int):
debug_samples = int(getattr(debug_samples, "default", 0))
if debug_samples < 0:
@@ -75,7 +112,8 @@ def evaluate_detection_cmd(
engine_name=engine,
engine_path=engine_path.strip() or None,
image_dir=image_dir,
label_dir=label_dir,
label_dir=label_dir.strip() or None,
coco_annotations=coco_annotations.strip() or None,
num_classes=num_classes,
conf_threshold=conf_threshold,
nms_threshold=nms_threshold,
@@ -114,6 +152,9 @@ def evaluate_detection_cmd(
"task": "detection",
"engine": engine,
"engine_path": engine_path.strip() or None,
"preset": preset,
"model_contract_path": model_contract.strip() or None,
"coco_annotations": coco_annotations.strip() or None,
"num_classes": num_classes,
},
accuracy=accuracy_payload,
@@ -126,22 +167,47 @@ def evaluate_detection_cmd(
"evaluation_config": eval_result.evaluation_config,
"engine_path": engine_path.strip() or None,
"runtime_artifact_path": eval_result.extra.get("runtime_artifact_path"),
"structural_validation": eval_result.extra.get("structural_validation"),
"accuracy_status": eval_result.extra.get("accuracy_status", "evaluated"),
}
},
)
result_path = save_result(structured, out_dir=out_dir)

evaluation_report = build_evaluation_report(
eval_result=eval_result,
model_contract=contract,
preset=preset_def.to_dict(),
)
save_evaluation_report(
evaluation_report,
json_path=report_json,
markdown_path=report_md,
html_path=report_html,
)

rprint(f"Engine : {eval_result.engine}")
rprint(f"Images : {image_dir}")
rprint(f"Labels : {label_dir}")
rprint(f"Labels : {label_dir or '(not provided)'}")
rprint(f"COCO annotations: {coco_annotations or '(not provided)'}")
rprint(f"Samples : {eval_result.sample_count}")
rprint(f"Precision : {eval_result.metrics['precision']:.4f}")
rprint(f"Recall : {eval_result.metrics['recall']:.4f}")
rprint(f"F1 Score : {eval_result.metrics['f1_score']:.4f}")
rprint(f"mAP@50 : {eval_result.metrics['map50']:.4f}")
rprint(f"mAP@50-95 : {eval_result.metrics['map50_95']:.4f}")
rprint(f"Accuracy status : {eval_result.extra.get('accuracy_status', 'evaluated')}")
if eval_result.extra.get("accuracy_status") == "skipped":
rprint(f"Accuracy skipped: {eval_result.extra.get('accuracy_skip_reason')}")
else:
rprint(f"Precision : {eval_result.metrics['precision']:.4f}")
rprint(f"Recall : {eval_result.metrics['recall']:.4f}")
rprint(f"F1 Score : {eval_result.metrics['f1_score']:.4f}")
rprint(f"mAP@50 : {eval_result.metrics['map50']:.4f}")
rprint(f"mAP@50-95 : {eval_result.metrics['map50_95']:.4f}")

if saved_json_path:
rprint(f"[cyan]Saved accuracy[/cyan] : {saved_json_path}")
if report_json.strip():
rprint(f"[cyan]Saved evaluation JSON[/cyan]: {report_json}")
if report_md.strip():
rprint(f"[cyan]Saved evaluation Markdown[/cyan]: {report_md}")
if report_html.strip():
rprint(f"[cyan]Saved evaluation HTML[/cyan]: {report_html}")
if result_path:
rprint(f"[cyan]Saved structured result[/cyan]: {result_path}")