diff --git a/README.md b/README.md index 648cf27..de7ca9d 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,17 @@ CLI / API → Service Layer → Structured Result → Compare / Report --- +## Contract-Based Validation + +InferEdgeLab treats model evaluation as a **contract/preset-based validation workflow**, not as a claim that any arbitrary model can be automatically scored without context. +`evaluate-detection` now supports the `yolov8_coco` preset, optional `model_contract.json`, COCO annotations, YOLO txt labels, structural detection-output validation, and JSON/Markdown/HTML evaluation reports. +When annotations are not provided, accuracy is explicitly marked as `skipped` and the report records structural validation only. + +Planned presets such as `resnet_imagenet` and `custom_contract` keep future evaluation work scoped to explicit model contracts and dataset assumptions. +Small normal/problem contract fixtures live under `examples/validation_demo/`. + +--- + ## Key Results (Real Hardware Validation) InferEdgeLab was validated on real edge hardware using YOLOv8 models. diff --git a/examples/validation_demo/coco_minimal_annotations.json b/examples/validation_demo/coco_minimal_annotations.json new file mode 100644 index 0000000..7bf04bc --- /dev/null +++ b/examples/validation_demo/coco_minimal_annotations.json @@ -0,0 +1,26 @@ +{ + "images": [ + { + "id": 1, + "file_name": "sample.jpg", + "width": 640, + "height": 480 + } + ], + "categories": [ + { + "id": 1, + "name": "person" + } + ], + "annotations": [ + { + "id": 1, + "image_id": 1, + "category_id": 1, + "bbox": [100.0, 120.0, 80.0, 60.0], + "area": 4800.0, + "iscrowd": 0 + } + ] +} diff --git a/examples/validation_demo/problem_model_contract.json b/examples/validation_demo/problem_model_contract.json new file mode 100644 index 0000000..e98b96c --- /dev/null +++ b/examples/validation_demo/problem_model_contract.json @@ -0,0 +1,24 @@ +{ + "contract_version": "1", + "task": "object_detection", + "preset": "yolov8_coco", + "input": { + "name": "images", + "shape": [1, 3, 320, 320], + "format": "NCHW_RGB_FLOAT32_0_1" + }, + "output": { + "name": "output0", + "type": "yolov8_detection", + "shape": [1, 84, 8400], + "format": "tensor" + }, + "thresholds": { + "score": 0.25, + "iou": 0.5 + }, + "metadata": { + "demo_case": "problem", + "expected_issue": "Input shape intentionally differs from the YOLOv8 COCO preset input size." + } +} diff --git a/examples/validation_demo/yolov8_coco_model_contract.json b/examples/validation_demo/yolov8_coco_model_contract.json new file mode 100644 index 0000000..5162ba7 --- /dev/null +++ b/examples/validation_demo/yolov8_coco_model_contract.json @@ -0,0 +1,26 @@ +{ + "contract_version": "1", + "task": "object_detection", + "preset": "yolov8_coco", + "input": { + "name": "images", + "shape": [1, 3, 640, 640], + "format": "NCHW_RGB_FLOAT32_0_1", + "dtype": "float32" + }, + "output": { + "name": "output0", + "type": "yolov8_detection", + "shape": [1, 84, 8400], + "format": "tensor", + "dtype": "float32" + }, + "thresholds": { + "score": 0.25, + "iou": 0.5 + }, + "metadata": { + "demo_case": "normal", + "note": "Small contract fixture for contract/preset validation demos." 
+ } +} diff --git a/inferedgelab/commands/evaluate_detection.py b/inferedgelab/commands/evaluate_detection.py index dc4eca0..73b0379 100644 --- a/inferedgelab/commands/evaluate_detection.py +++ b/inferedgelab/commands/evaluate_detection.py @@ -19,6 +19,13 @@ from inferedgelab.result.saver import save_result from inferedgelab.result.schema import BenchmarkResult from inferedgelab.utils.system_info import collect_system_snapshot +from inferedgelab.validation.model_contract import ( + ModelContractError, + build_default_contract, + load_model_contract, +) +from inferedgelab.validation.presets import get_preset, supported_presets +from inferedgelab.validation.report import build_evaluation_report, save_evaluation_report def _exit_with_runtime_error(message: str) -> None: @@ -26,12 +33,22 @@ def _exit_with_runtime_error(message: str) -> None: raise typer.Exit(code=1) +def _option_string(value: object, default: str = "") -> str: + if isinstance(value, str): + return value + option_default = getattr(value, "default", default) + return option_default if isinstance(option_default, str) else default + + def evaluate_detection_cmd( model_path: str = typer.Argument(..., help="평가할 ONNX 모델 경로"), engine: str = typer.Option("tensorrt", "--engine", help="추론 엔진 선택"), engine_path: str = typer.Option("", "--engine-path", help="Runtime artifact 경로"), image_dir: str = typer.Option(..., "--image-dir", help="평가 이미지 디렉토리"), - label_dir: str = typer.Option(..., "--label-dir", help="YOLO txt 라벨 디렉토리"), + label_dir: str = typer.Option("", "--label-dir", help="YOLO txt 라벨 디렉토리"), + coco_annotations: str = typer.Option("", "--coco-annotations", help="COCO annotation JSON 경로"), + preset: str = typer.Option("yolov8_coco", "--preset", help="Validation preset 이름"), + model_contract: str = typer.Option("", "--model-contract", help="model_contract.json 경로"), num_classes: int = typer.Option(1, "--num-classes", help="클래스 수"), precision: str = typer.Option("fp16", "--precision", help="precision 메타데이터 (fp32, fp16, int8)"), conf_threshold: float = typer.Option(0.2, "--conf-threshold", help="confidence threshold"), @@ -40,6 +57,9 @@ def evaluate_detection_cmd( rgb: bool = typer.Option(True, "--rgb/--bgr", help="Use RGB input conversion after OpenCV read"), debug_samples: int = typer.Option(0, "--debug-samples", help="Print internal debug output for the first N images"), out_json: str = typer.Option("", "--out-json", help="Accuracy payload 저장 경로"), + report_json: str = typer.Option("", "--report-json", help="Evaluation report JSON 저장 경로"), + report_md: str = typer.Option("", "--report-md", help="Evaluation report Markdown 저장 경로"), + report_html: str = typer.Option("", "--report-html", help="Evaluation report HTML 저장 경로"), out_dir: str = typer.Option("results", "--out-dir", help="structured result 저장 디렉토리"), save_structured_result: bool = typer.Option( True, @@ -64,6 +84,23 @@ def evaluate_detection_cmd( if num_classes <= 0: raise typer.BadParameter("--num-classes must be >= 1") + coco_annotations = _option_string(coco_annotations) + preset = _option_string(preset, "yolov8_coco") + model_contract = _option_string(model_contract) + report_json = _option_string(report_json) + report_md = _option_string(report_md) + report_html = _option_string(report_html) + preset = preset.strip().lower() + try: + preset_def = get_preset(preset) + contract = ( + load_model_contract(model_contract.strip(), default_preset=preset) + if model_contract.strip() + else build_default_contract(preset) + ) + except (ValueError, ModelContractError) as exc: + 
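+        # Preset/contract failures surface as CLI parameter errors that also list the supported presets.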
supported = ", ".join(supported_presets()) + raise typer.BadParameter(f"{exc} Supported presets: {supported}") from exc if not isinstance(debug_samples, int): debug_samples = int(getattr(debug_samples, "default", 0)) if debug_samples < 0: @@ -75,7 +112,8 @@ def evaluate_detection_cmd( engine_name=engine, engine_path=engine_path.strip() or None, image_dir=image_dir, - label_dir=label_dir, + label_dir=label_dir.strip() or None, + coco_annotations=coco_annotations.strip() or None, num_classes=num_classes, conf_threshold=conf_threshold, nms_threshold=nms_threshold, @@ -114,6 +152,9 @@ def evaluate_detection_cmd( "task": "detection", "engine": engine, "engine_path": engine_path.strip() or None, + "preset": preset, + "model_contract_path": model_contract.strip() or None, + "coco_annotations": coco_annotations.strip() or None, "num_classes": num_classes, }, accuracy=accuracy_payload, @@ -126,22 +167,47 @@ def evaluate_detection_cmd( "evaluation_config": eval_result.evaluation_config, "engine_path": engine_path.strip() or None, "runtime_artifact_path": eval_result.extra.get("runtime_artifact_path"), + "structural_validation": eval_result.extra.get("structural_validation"), + "accuracy_status": eval_result.extra.get("accuracy_status", "evaluated"), } }, ) result_path = save_result(structured, out_dir=out_dir) + evaluation_report = build_evaluation_report( + eval_result=eval_result, + model_contract=contract, + preset=preset_def.to_dict(), + ) + save_evaluation_report( + evaluation_report, + json_path=report_json, + markdown_path=report_md, + html_path=report_html, + ) + rprint(f"Engine : {eval_result.engine}") rprint(f"Images : {image_dir}") - rprint(f"Labels : {label_dir}") + rprint(f"Labels : {label_dir or '(not provided)'}") + rprint(f"COCO annotations: {coco_annotations or '(not provided)'}") rprint(f"Samples : {eval_result.sample_count}") - rprint(f"Precision : {eval_result.metrics['precision']:.4f}") - rprint(f"Recall : {eval_result.metrics['recall']:.4f}") - rprint(f"F1 Score : {eval_result.metrics['f1_score']:.4f}") - rprint(f"mAP@50 : {eval_result.metrics['map50']:.4f}") - rprint(f"mAP@50-95 : {eval_result.metrics['map50_95']:.4f}") + rprint(f"Accuracy status : {eval_result.extra.get('accuracy_status', 'evaluated')}") + if eval_result.extra.get("accuracy_status") == "skipped": + rprint(f"Accuracy skipped: {eval_result.extra.get('accuracy_skip_reason')}") + else: + rprint(f"Precision : {eval_result.metrics['precision']:.4f}") + rprint(f"Recall : {eval_result.metrics['recall']:.4f}") + rprint(f"F1 Score : {eval_result.metrics['f1_score']:.4f}") + rprint(f"mAP@50 : {eval_result.metrics['map50']:.4f}") + rprint(f"mAP@50-95 : {eval_result.metrics['map50_95']:.4f}") if saved_json_path: rprint(f"[cyan]Saved accuracy[/cyan] : {saved_json_path}") + if report_json.strip(): + rprint(f"[cyan]Saved evaluation JSON[/cyan]: {report_json}") + if report_md.strip(): + rprint(f"[cyan]Saved evaluation Markdown[/cyan]: {report_md}") + if report_html.strip(): + rprint(f"[cyan]Saved evaluation HTML[/cyan]: {report_html}") if result_path: rprint(f"[cyan]Saved structured result[/cyan]: {result_path}") diff --git a/inferedgelab/core/detection_evaluator.py b/inferedgelab/core/detection_evaluator.py index 834f048..9ade7b5 100644 --- a/inferedgelab/core/detection_evaluator.py +++ b/inferedgelab/core/detection_evaluator.py @@ -10,6 +10,8 @@ from inferedgelab.engines.base import EngineModelIO from inferedgelab.engines.registry import create_engine, normalize_engine_name +from inferedgelab.validation.coco import 
load_coco_ground_truths +from inferedgelab.validation.structural import validate_detection_structure @dataclass @@ -750,9 +752,11 @@ def compute_precision_recall_f1( def build_accuracy_payload(eval_result: DetectionEvalResult) -> dict[str, Any]: return { "task": "detection", + "status": eval_result.extra.get("accuracy_status", "evaluated"), "metrics": dict(eval_result.metrics), "dataset": dict(eval_result.dataset), "evaluation_config": dict(eval_result.evaluation_config), + "notes": list(eval_result.notes), } @@ -797,7 +801,8 @@ def evaluate_detection_engine( engine_name: str, engine_path: str | None, image_dir: str, - label_dir: str, + label_dir: str | None = None, + coco_annotations: str | None = None, num_classes: int = 1, conf_threshold: float = 0.2, nms_threshold: float = 0.45, @@ -821,6 +826,14 @@ def evaluate_detection_engine( model_input = engine.inputs[0] image_files = get_image_files(image_dir) + coco_ground_truths = load_coco_ground_truths(coco_annotations) if coco_annotations else {} + accuracy_status = "evaluated" if label_dir or coco_annotations else "skipped" + accuracy_skip_reason = "" + if accuracy_status == "skipped": + accuracy_skip_reason = ( + "No YOLO label directory or COCO annotation file was provided; " + "only output structure was validated." + ) predictions_by_image: list[list[Detection]] = [] ground_truths_by_image: list[list[GroundTruth]] = [] @@ -858,12 +871,19 @@ def evaluate_detection_engine( debug=postprocess_debug, ) - label_path = os.path.join(label_dir, f"{Path(image_path).stem}.txt") - ground_truths = load_ground_truth( - label_path, - image_width=original_width, - image_height=original_height, - ) + ground_truths: list[GroundTruth] = [] + if label_dir: + label_path = os.path.join(label_dir, f"{Path(image_path).stem}.txt") + ground_truths = load_ground_truth( + label_path, + image_width=original_width, + image_height=original_height, + ) + elif coco_annotations: + ground_truths = [ + GroundTruth(class_id=item.class_id, box=item.box) + for item in coco_ground_truths.get(Path(image_path).name, []) + ] predictions_by_image.append(detections) ground_truths_by_image.append(ground_truths) @@ -886,32 +906,40 @@ def evaluate_detection_engine( ) ) - precision, recall, f1_score = compute_precision_recall_f1( + structural_validation = validate_detection_structure( predictions_by_image, - ground_truths_by_image, num_classes=num_classes, - iou_threshold=iou_threshold, ) - map50 = compute_average_precision( - predictions_by_image, - ground_truths_by_image, - num_classes=num_classes, - iou_threshold=0.5, - ) - map_thresholds = np.arange(0.5, 1.0, 0.05) - map50_95 = float( - np.mean( - [ - compute_average_precision( - predictions_by_image, - ground_truths_by_image, - num_classes=num_classes, - iou_threshold=float(threshold), - ) - for threshold in map_thresholds - ] + + if accuracy_status == "evaluated": + precision, recall, f1_score = compute_precision_recall_f1( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=iou_threshold, ) - ) + map50 = compute_average_precision( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=0.5, + ) + map_thresholds = np.arange(0.5, 1.0, 0.05) + map50_95 = float( + np.mean( + [ + compute_average_precision( + predictions_by_image, + ground_truths_by_image, + num_classes=num_classes, + iou_threshold=float(threshold), + ) + for threshold in map_thresholds + ] + ) + ) + else: + precision = recall = f1_score = map50 = map50_95 = 0.0 return DetectionEvalResult( 
task="detection", @@ -926,9 +954,10 @@ def evaluate_detection_engine( "recall": recall, }, notes=[ - "Detection evaluation uses YOLO txt labels and image directory traversal.", + "Detection evaluation uses image directory traversal.", "YOLOv8 postprocessing supports single-output and split boxes/scores output layouts.", - "Primary detection accuracy metric for compare/enrich reuse is map50.", + "Accuracy uses YOLO txt labels or COCO annotations when provided.", + "When annotations are missing, InferEdge records accuracy_skipped and structural validation only.", ], model_input={ "name": model_input.name, @@ -939,7 +968,9 @@ def evaluate_detection_engine( dataset={ "image_dir": image_dir, "label_dir": label_dir, + "coco_annotations": coco_annotations, "sample_count": len(image_files), + "accuracy_status": accuracy_status, }, evaluation_config={ "conf_threshold": conf_threshold, @@ -953,6 +984,9 @@ def evaluate_detection_engine( "engine_path": engine_path, "runtime_artifact_path": getattr(engine.runtime_paths, "runtime_artifact_path", None), "image_files": image_files, + "accuracy_status": accuracy_status, + "accuracy_skip_reason": accuracy_skip_reason, + "structural_validation": structural_validation, }, ) finally: diff --git a/inferedgelab/validation/__init__.py b/inferedgelab/validation/__init__.py new file mode 100644 index 0000000..08ce242 --- /dev/null +++ b/inferedgelab/validation/__init__.py @@ -0,0 +1,14 @@ +"""Validation contract, preset, and report helpers for InferEdgeLab.""" + +from inferedgelab.validation.model_contract import ModelContract, load_model_contract, parse_model_contract +from inferedgelab.validation.presets import get_preset, supported_presets +from inferedgelab.validation.report import build_evaluation_report + +__all__ = [ + "ModelContract", + "build_evaluation_report", + "get_preset", + "load_model_contract", + "parse_model_contract", + "supported_presets", +] diff --git a/inferedgelab/validation/coco.py b/inferedgelab/validation/coco.py new file mode 100644 index 0000000..a17f9ec --- /dev/null +++ b/inferedgelab/validation/coco.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from dataclasses import dataclass +import json +from pathlib import Path +from typing import Any + + +@dataclass(frozen=True) +class CocoGroundTruth: + image_id: int + file_name: str + class_id: int + box: tuple[float, float, float, float] + + +def load_coco_ground_truths(path: str) -> dict[str, list[CocoGroundTruth]]: + payload = json.loads(Path(path).read_text(encoding="utf-8")) + if not isinstance(payload, dict): + raise ValueError("COCO annotations must be a JSON object.") + + images = payload.get("images") + annotations = payload.get("annotations") + categories = payload.get("categories", []) + if not isinstance(images, list) or not isinstance(annotations, list): + raise ValueError("COCO annotations require images and annotations arrays.") + + image_by_id: dict[int, str] = {} + for image in images: + if not isinstance(image, dict): + continue + image_id = int(image["id"]) + image_by_id[image_id] = str(image["file_name"]) + + category_to_class = _category_to_zero_based_class(categories) + result: dict[str, list[CocoGroundTruth]] = {} + for annotation in annotations: + item = _parse_annotation(annotation, image_by_id=image_by_id, category_to_class=category_to_class) + if item is None: + continue + result.setdefault(Path(item.file_name).name, []).append(item) + return result + + +def _category_to_zero_based_class(categories: Any) -> dict[int, int]: + if not 
isinstance(categories, list) or not categories: + return {} + ids = sorted(int(category["id"]) for category in categories if isinstance(category, dict) and "id" in category) + return {category_id: index for index, category_id in enumerate(ids)} + + +def _parse_annotation( + annotation: Any, + *, + image_by_id: dict[int, str], + category_to_class: dict[int, int], +) -> CocoGroundTruth | None: + if not isinstance(annotation, dict): + return None + if annotation.get("iscrowd", 0): + return None + + image_id = int(annotation["image_id"]) + file_name = image_by_id.get(image_id) + if not file_name: + return None + + bbox = annotation.get("bbox") + if not isinstance(bbox, list) or len(bbox) != 4: + return None + x, y, width, height = (float(value) for value in bbox) + category_id = int(annotation["category_id"]) + class_id = category_to_class.get(category_id, category_id - 1) + return CocoGroundTruth( + image_id=image_id, + file_name=file_name, + class_id=class_id, + box=(x + width / 2.0, y + height / 2.0, width, height), + ) diff --git a/inferedgelab/validation/model_contract.py b/inferedgelab/validation/model_contract.py new file mode 100644 index 0000000..d37ff3d --- /dev/null +++ b/inferedgelab/validation/model_contract.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass +import json +from pathlib import Path +from typing import Any + +from inferedgelab.validation.presets import get_preset + + +class ModelContractError(ValueError): + pass + + +@dataclass(frozen=True) +class ModelContractIO: + shape: list[int] + format: str + name: str | None = None + dtype: str | None = None + type: str | None = None + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +@dataclass(frozen=True) +class ModelContract: + contract_version: str + task: str + preset: str + labels: list[str] + input: ModelContractIO + output: ModelContractIO + thresholds: dict[str, float] + metadata: dict[str, Any] + + def to_dict(self) -> dict[str, Any]: + return { + "contract_version": self.contract_version, + "task": self.task, + "preset": self.preset, + "labels": list(self.labels), + "input": self.input.to_dict(), + "output": self.output.to_dict(), + "thresholds": dict(self.thresholds), + "metadata": dict(self.metadata), + } + + +def build_default_contract(preset_name: str = "yolov8_coco") -> ModelContract: + preset = get_preset(preset_name) + return ModelContract( + contract_version="1", + task=preset.task, + preset=preset.name, + labels=list(preset.labels), + input=ModelContractIO(shape=list(preset.input_shape), format=preset.input_format), + output=ModelContractIO( + shape=list(preset.output_shape), + format="tensor", + type=preset.output_type, + ), + thresholds=dict(preset.thresholds), + metadata={"source": "preset", "description": preset.description}, + ) + + +def parse_model_contract(payload: dict[str, Any], *, default_preset: str = "yolov8_coco") -> ModelContract: + if not isinstance(payload, dict): + raise ModelContractError("model_contract payload must be a JSON object.") + + preset_name = str(payload.get("preset") or default_preset).strip().lower() + preset = get_preset(preset_name) + default_contract = build_default_contract(preset.name) + + input_payload = payload.get("input") or {} + output_payload = payload.get("output") or {} + if not isinstance(input_payload, dict): + raise ModelContractError("model_contract.input must be an object.") + if not isinstance(output_payload, dict): + raise ModelContractError("model_contract.output must be an object.") + + 
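+    # Labels, thresholds, and shapes omitted from the contract fall back to the preset defaults.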
labels = payload.get("labels", default_contract.labels) + if labels is None: + labels = [] + if not isinstance(labels, list) or not all(isinstance(label, str) for label in labels): + raise ModelContractError("model_contract.labels must be a list of strings.") + + thresholds = payload.get("thresholds", default_contract.thresholds) + if not isinstance(thresholds, dict): + raise ModelContractError("model_contract.thresholds must be an object.") + + input_shape = input_payload.get("shape", default_contract.input.shape) + output_shape = output_payload.get("shape", default_contract.output.shape) + _validate_shape(input_shape, "input.shape") + _validate_shape(output_shape, "output.shape") + + task = str(payload.get("task") or preset.task) + if preset.name != "custom_contract" and task != preset.task: + raise ModelContractError(f"model_contract.task '{task}' does not match preset '{preset.name}'.") + + return ModelContract( + contract_version=str(payload.get("contract_version") or default_contract.contract_version), + task=task, + preset=preset.name, + labels=list(labels), + input=ModelContractIO( + shape=[int(value) for value in input_shape], + format=str(input_payload.get("format") or default_contract.input.format), + name=_optional_string(input_payload.get("name")), + dtype=_optional_string(input_payload.get("dtype")), + ), + output=ModelContractIO( + shape=[int(value) for value in output_shape], + format=str(output_payload.get("format") or default_contract.output.format), + name=_optional_string(output_payload.get("name")), + dtype=_optional_string(output_payload.get("dtype")), + type=str(output_payload.get("type") or default_contract.output.type), + ), + thresholds={str(key): float(value) for key, value in thresholds.items()}, + metadata=dict(payload.get("metadata") or {}), + ) + + +def load_model_contract(path: str, *, default_preset: str = "yolov8_coco") -> ModelContract: + contract_path = Path(path) + try: + payload = json.loads(contract_path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise ModelContractError(f"model_contract is not valid JSON: {path}") from exc + return parse_model_contract(payload, default_preset=default_preset) + + +def _validate_shape(shape: Any, field_name: str) -> None: + if not isinstance(shape, list) or not shape: + raise ModelContractError(f"model_contract.{field_name} must be a non-empty list.") + if not all(isinstance(value, int) and value > 0 for value in shape): + raise ModelContractError(f"model_contract.{field_name} must contain positive integers.") + + +def _optional_string(value: Any) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None diff --git a/inferedgelab/validation/presets.py b/inferedgelab/validation/presets.py new file mode 100644 index 0000000..e9e5cf4 --- /dev/null +++ b/inferedgelab/validation/presets.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +from dataclasses import asdict, dataclass +from typing import Any + + +@dataclass(frozen=True) +class ValidationPreset: + name: str + task: str + description: str + input_shape: list[int] + input_format: str + output_type: str + output_shape: list[int] + labels: list[str] + thresholds: dict[str, float] + accuracy: dict[str, Any] + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +COCO80_LABELS = [ + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + 
"dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +] + + +_PRESETS: dict[str, ValidationPreset] = { + "yolov8_coco": ValidationPreset( + name="yolov8_coco", + task="object_detection", + description="YOLOv8 object detection on COCO-style labels.", + input_shape=[1, 3, 640, 640], + input_format="NCHW_RGB_FLOAT32_0_1", + output_type="yolov8_detection", + output_shape=[1, 84, 8400], + labels=COCO80_LABELS, + thresholds={"score": 0.25, "iou": 0.5}, + accuracy={ + "primary_metric": "map50", + "secondary_metrics": ["precision", "recall", "f1_score", "map50_95"], + "annotation_formats": ["coco", "yolo_txt"], + }, + ), + "resnet_imagenet": ValidationPreset( + name="resnet_imagenet", + task="classification", + description="ImageNet classification contract placeholder.", + input_shape=[1, 3, 224, 224], + input_format="NCHW_RGB_FLOAT32_0_1", + output_type="classification_logits", + output_shape=[1, 1000], + labels=[], + thresholds={"top1_min": 0.0, "top5_min": 0.0}, + accuracy={ + "primary_metric": "top1", + "secondary_metrics": ["top5"], + "annotation_formats": ["imagenet_folder", "custom_contract"], + }, + ), + "custom_contract": ValidationPreset( + name="custom_contract", + task="custom", + description="Custom validation requires an explicit model_contract.json.", + input_shape=[], + input_format="custom", + output_type="custom", + output_shape=[], + labels=[], + thresholds={}, + accuracy={ + "primary_metric": "contract_defined", + "secondary_metrics": [], + "annotation_formats": ["custom_contract"], + }, + ), +} + + +def supported_presets() -> list[str]: + return sorted(_PRESETS) + + +def get_preset(name: str) -> ValidationPreset: + key = name.strip().lower() + if key not in _PRESETS: + supported = ", ".join(supported_presets()) + raise ValueError(f"Unsupported validation preset: {name}. 
Supported presets: {supported}") + return _PRESETS[key] diff --git a/inferedgelab/validation/report.py b/inferedgelab/validation/report.py new file mode 100644 index 0000000..693f672 --- /dev/null +++ b/inferedgelab/validation/report.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from datetime import datetime +import html +import json +from pathlib import Path +from typing import Any + +from inferedgelab.validation.model_contract import ModelContract +from inferedgelab.validation.structural import validate_shape + + +def build_evaluation_report( + *, + eval_result: Any, + model_contract: ModelContract, + preset: dict[str, Any], + latency_summary: dict[str, Any] | None = None, +) -> dict[str, Any]: + accuracy_status = str(eval_result.extra.get("accuracy_status") or "evaluated") + structural_validation = dict(eval_result.extra.get("structural_validation") or {}) + contract_validation = { + "input_shape": validate_shape(eval_result.actual_input_shape, model_contract.input.shape) + if eval_result.actual_input_shape and model_contract.input.shape + else {"status": "not_checked"}, + "preset": model_contract.preset, + "task": model_contract.task, + } + return { + "report_role": "inferedge-evaluation-report", + "generated_at": datetime.utcnow().isoformat(timespec="seconds") + "Z", + "preset": preset, + "model_contract": model_contract.to_dict(), + "runtime_result": { + "engine": eval_result.engine, + "device": eval_result.device, + "sample_count": eval_result.sample_count, + "model_input": eval_result.model_input, + "actual_input_shape": eval_result.actual_input_shape, + }, + "accuracy": { + "status": accuracy_status, + "metrics": dict(eval_result.metrics), + "reason": eval_result.extra.get("accuracy_skip_reason") if accuracy_status == "skipped" else None, + }, + "contract_validation": contract_validation, + "structural_validation": structural_validation, + "latency_summary": latency_summary or {"status": "not_provided"}, + "deployment_signal": _deployment_signal(accuracy_status, structural_validation, contract_validation), + "notes": list(eval_result.notes), + } + + +def save_evaluation_report(report: dict[str, Any], *, json_path: str = "", markdown_path: str = "", html_path: str = "") -> None: + if json_path.strip(): + _write_text(json_path, json.dumps(report, ensure_ascii=False, indent=2) + "\n") + if markdown_path.strip(): + _write_text(markdown_path, render_evaluation_markdown(report)) + if html_path.strip(): + _write_text(html_path, render_evaluation_html(report)) + + +def render_evaluation_markdown(report: dict[str, Any]) -> str: + accuracy = report["accuracy"] + structural = report.get("structural_validation") or {} + contract_validation = report.get("contract_validation") or {} + input_shape = contract_validation.get("input_shape") or {} + signal = report["deployment_signal"] + lines = [ + "# InferEdge Evaluation Report", + "", + f"- preset: `{report['preset']['name']}`", + f"- engine: `{report['runtime_result']['engine']}`", + f"- device: `{report['runtime_result']['device']}`", + f"- samples: `{report['runtime_result']['sample_count']}`", + f"- accuracy status: `{accuracy['status']}`", + f"- contract input shape: `{input_shape.get('status', 'unknown')}`", + f"- structural validation: `{structural.get('status', 'unknown')}`", + f"- deployment signal: `{signal['decision']}`", + "", + "## Metrics", + ] + if accuracy["status"] == "skipped": + lines.append(f"- accuracy skipped reason: {accuracy.get('reason') or 'not provided'}") + for key, value in accuracy.get("metrics", 
{}).items():
+        lines.append(f"- {key}: `{value}`")
+    lines.extend(["", "## Notes"])
+    for note in report.get("notes", []):
+        lines.append(f"- {note}")
+    return "\n".join(lines) + "\n"
+
+
+def render_evaluation_html(report: dict[str, Any]) -> str:
+    markdown = render_evaluation_markdown(report)
+    escaped = html.escape(markdown)
+    return (
+        "<!DOCTYPE html>\n"
+        "<html><head><title>InferEdge Evaluation Report</title></head>\n"
+        "<body><pre>\n"
+        f"{escaped}"
+        "</pre></body></html>
\n" + ) + + +def _deployment_signal( + accuracy_status: str, + structural_validation: dict[str, Any], + contract_validation: dict[str, Any], +) -> dict[str, str]: + if (contract_validation.get("input_shape") or {}).get("status") == "mismatch": + return { + "decision": "blocked", + "reason": "Actual runtime input shape does not match the model contract.", + } + if structural_validation.get("status") == "failed": + return { + "decision": "blocked", + "reason": "Structural validation found invalid detection output.", + } + if accuracy_status == "skipped": + return { + "decision": "review", + "reason": "Accuracy evaluation was skipped because annotations were not provided.", + } + return { + "decision": "review", + "reason": "Accuracy evidence is available; compare and deployment policy still decide release.", + } + + +def _write_text(path: str, text: str) -> None: + out_path = Path(path) + if out_path.parent != Path("."): + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(text, encoding="utf-8") diff --git a/inferedgelab/validation/structural.py b/inferedgelab/validation/structural.py new file mode 100644 index 0000000..3110f17 --- /dev/null +++ b/inferedgelab/validation/structural.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import math +from typing import Any, Sequence + + +def validate_detection_structure( + detections_by_image: Sequence[Sequence[Any]], + *, + num_classes: int | None = None, +) -> dict[str, Any]: + issues: list[dict[str, Any]] = [] + detection_count = 0 + + for image_index, detections in enumerate(detections_by_image): + for detection_index, detection in enumerate(detections): + detection_count += 1 + class_id = int(getattr(detection, "class_id", -1)) + confidence = float(getattr(detection, "confidence", float("nan"))) + box = tuple(float(value) for value in getattr(detection, "box", ())) + + if num_classes is not None and not 0 <= class_id < num_classes: + issues.append( + _issue(image_index, detection_index, "class_id_out_of_range", class_id) + ) + if not math.isfinite(confidence) or not 0.0 <= confidence <= 1.0: + issues.append(_issue(image_index, detection_index, "score_out_of_range", confidence)) + if len(box) != 4: + issues.append(_issue(image_index, detection_index, "bbox_not_xywh", list(box))) + continue + if not all(math.isfinite(value) for value in box): + issues.append(_issue(image_index, detection_index, "bbox_non_finite", list(box))) + if box[2] <= 0.0 or box[3] <= 0.0: + issues.append(_issue(image_index, detection_index, "bbox_non_positive_size", list(box))) + + return { + "status": "passed" if not issues else "failed", + "checked": { + "image_count": len(detections_by_image), + "detection_count": detection_count, + "num_classes": num_classes, + }, + "issues": issues, + } + + +def validate_shape(actual_shape: Sequence[int], expected_shape: Sequence[int]) -> dict[str, Any]: + actual = [int(value) for value in actual_shape] + expected = [int(value) for value in expected_shape] + return { + "status": "passed" if actual == expected else "mismatch", + "actual_shape": actual, + "expected_shape": expected, + } + + +def _issue(image_index: int, detection_index: int, code: str, value: Any) -> dict[str, Any]: + return { + "image_index": image_index, + "detection_index": detection_index, + "code": code, + "value": value, + } diff --git a/tests/fixtures/validation/coco_minimal.json b/tests/fixtures/validation/coco_minimal.json new file mode 100644 index 0000000..7fc0b4a --- /dev/null +++ b/tests/fixtures/validation/coco_minimal.json 
@@ -0,0 +1,26 @@ +{ + "images": [ + { + "id": 1, + "file_name": "sample.jpg", + "width": 640, + "height": 480 + } + ], + "categories": [ + { + "id": 1, + "name": "person" + } + ], + "annotations": [ + { + "id": 10, + "image_id": 1, + "category_id": 1, + "bbox": [100.0, 120.0, 80.0, 60.0], + "area": 4800.0, + "iscrowd": 0 + } + ] +} diff --git a/tests/test_evaluate_detection.py b/tests/test_evaluate_detection.py index 750a72a..9adc24e 100644 --- a/tests/test_evaluate_detection.py +++ b/tests/test_evaluate_detection.py @@ -273,6 +273,79 @@ def fake_save_result(result, out_dir="results"): assert captured["engine_kwargs"]["debug_samples"] == 0 +def test_evaluate_detection_command_writes_contract_evaluation_report(tmp_path, monkeypatch): + from inferedgelab.commands import evaluate_detection + + captured = {} + + def fake_evaluate_detection_engine(**kwargs): + captured["engine_kwargs"] = kwargs + return DetectionEvalResult( + task="detection", + engine="onnxruntime", + device="cpu", + sample_count=1, + metrics={ + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0, + }, + notes=["structural validation only"], + model_input={"name": "images", "dtype": "float32", "shape": [1, 3, 640, 640]}, + actual_input_shape=[1, 3, 640, 640], + dataset={"image_dir": "images", "label_dir": None, "sample_count": 1}, + evaluation_config={ + "conf_threshold": 0.2, + "nms_threshold": 0.45, + "iou_threshold": 0.5, + "input_size": 640, + "rgb": True, + }, + extra={ + "accuracy_status": "skipped", + "accuracy_skip_reason": "No annotations were provided.", + "structural_validation": {"status": "passed", "issues": []}, + }, + ) + + monkeypatch.setattr(evaluate_detection, "evaluate_detection_engine", fake_evaluate_detection_engine) + + report_json = tmp_path / "evaluation.json" + report_md = tmp_path / "evaluation.md" + evaluate_detection.evaluate_detection_cmd( + model_path="models/onnx/yolov8n.onnx", + engine="onnxruntime", + engine_path="", + image_dir="images", + label_dir="", + coco_annotations="", + preset="yolov8_coco", + model_contract="", + num_classes=80, + precision="fp32", + conf_threshold=0.2, + nms_threshold=0.45, + iou_threshold=0.5, + rgb=True, + debug_samples=0, + out_json="", + report_json=str(report_json), + report_md=str(report_md), + report_html="", + out_dir=str(tmp_path / "results"), + save_structured_result=False, + ) + + report = json.loads(report_json.read_text(encoding="utf-8")) + assert captured["engine_kwargs"]["label_dir"] is None + assert captured["engine_kwargs"]["coco_annotations"] is None + assert report["model_contract"]["preset"] == "yolov8_coco" + assert report["accuracy"]["status"] == "skipped" + assert "accuracy skipped reason" in report_md.read_text(encoding="utf-8") + + def test_evaluate_detection_engine_debug_path_prints_sample_diagnostics(tmp_path, monkeypatch, capsys): image_dir = tmp_path / "images" label_dir = tmp_path / "labels" @@ -365,6 +438,9 @@ def test_evaluate_detection_help_shows_debug_samples_option(): assert result.exit_code == 0 assert "--debug-samples" in result.stdout + assert "--model-contract" in result.stdout + assert "--preset" in result.stdout + assert "--coco-annotations" in result.stdout def test_cli_help_registers_evaluate_detection_command(): diff --git a/tests/test_validation_contracts.py b/tests/test_validation_contracts.py new file mode 100644 index 0000000..70eac92 --- /dev/null +++ b/tests/test_validation_contracts.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +from pathlib import Path + +import 
pytest + +from inferedgelab.core.detection_evaluator import Detection, DetectionEvalResult +from inferedgelab.validation.coco import load_coco_ground_truths +from inferedgelab.validation.model_contract import ( + ModelContractError, + build_default_contract, + load_model_contract, + parse_model_contract, +) +from inferedgelab.validation.presets import get_preset, supported_presets +from inferedgelab.validation.report import build_evaluation_report, render_evaluation_markdown +from inferedgelab.validation.structural import validate_detection_structure, validate_shape + + +def test_yolov8_coco_preset_builds_default_model_contract(): + preset = get_preset("yolov8_coco") + contract = build_default_contract("yolov8_coco") + + assert "yolov8_coco" in supported_presets() + assert preset.task == "object_detection" + assert contract.preset == "yolov8_coco" + assert contract.input.shape == [1, 3, 640, 640] + assert contract.output.type == "yolov8_detection" + assert len(contract.labels) == 80 + + +def test_parse_model_contract_rejects_preset_task_mismatch(): + with pytest.raises(ModelContractError): + parse_model_contract( + { + "preset": "yolov8_coco", + "task": "classification", + "input": {"shape": [1, 3, 640, 640]}, + "output": {"shape": [1, 84, 8400]}, + } + ) + + +def test_example_validation_demo_contracts_are_parseable(): + repo_root = Path(__file__).resolve().parents[1] + + normal = load_model_contract(str(repo_root / "examples" / "validation_demo" / "yolov8_coco_model_contract.json")) + problem = load_model_contract(str(repo_root / "examples" / "validation_demo" / "problem_model_contract.json")) + + assert normal.metadata["demo_case"] == "normal" + assert problem.metadata["demo_case"] == "problem" + assert problem.input.shape == [1, 3, 320, 320] + + +def test_load_coco_ground_truths_maps_annotations_by_file_name(): + fixture = Path(__file__).parent / "fixtures" / "validation" / "coco_minimal.json" + + ground_truths = load_coco_ground_truths(str(fixture)) + + assert list(ground_truths) == ["sample.jpg"] + assert ground_truths["sample.jpg"][0].class_id == 0 + assert ground_truths["sample.jpg"][0].box == pytest.approx((140.0, 150.0, 80.0, 60.0)) + + +def test_structural_validation_detects_invalid_detection_fields(): + result = validate_detection_structure( + [[Detection(class_id=99, confidence=1.2, box=(10.0, 10.0, -5.0, 5.0))]], + num_classes=3, + ) + + assert result["status"] == "failed" + assert {issue["code"] for issue in result["issues"]} == { + "class_id_out_of_range", + "score_out_of_range", + "bbox_non_positive_size", + } + + +def test_shape_validation_reports_mismatch(): + result = validate_shape([1, 3, 320, 320], [1, 3, 640, 640]) + + assert result["status"] == "mismatch" + + +def test_evaluation_report_marks_missing_annotations_as_accuracy_skipped(): + eval_result = DetectionEvalResult( + task="detection", + engine="onnxruntime", + device="cpu", + sample_count=1, + metrics={ + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0, + }, + notes=["structural validation only"], + model_input={"name": "images", "dtype": "float32", "shape": [1, 3, 640, 640]}, + actual_input_shape=[1, 3, 640, 640], + dataset={"image_dir": "images", "sample_count": 1, "accuracy_status": "skipped"}, + evaluation_config={"input_size": 640}, + extra={ + "accuracy_status": "skipped", + "accuracy_skip_reason": "annotations missing", + "structural_validation": {"status": "passed", "issues": []}, + }, + ) + + report = build_evaluation_report( + eval_result=eval_result, + 
model_contract=build_default_contract("yolov8_coco"), + preset=get_preset("yolov8_coco").to_dict(), + ) + markdown = render_evaluation_markdown(report) + + assert report["accuracy"]["status"] == "skipped" + assert report["contract_validation"]["input_shape"]["status"] == "passed" + assert report["deployment_signal"]["decision"] == "review" + assert "accuracy skipped reason" in markdown + + +def test_evaluation_report_blocks_contract_shape_mismatch(): + eval_result = DetectionEvalResult( + task="detection", + engine="onnxruntime", + device="cpu", + sample_count=1, + metrics={ + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0, + }, + notes=[], + model_input={"name": "images", "dtype": "float32", "shape": [1, 3, 640, 640]}, + actual_input_shape=[1, 3, 640, 640], + dataset={"image_dir": "images", "sample_count": 1}, + evaluation_config={"input_size": 640}, + extra={"accuracy_status": "skipped", "structural_validation": {"status": "passed", "issues": []}}, + ) + contract = parse_model_contract( + { + "preset": "yolov8_coco", + "task": "object_detection", + "input": {"shape": [1, 3, 320, 320]}, + "output": {"shape": [1, 84, 8400]}, + } + ) + + report = build_evaluation_report( + eval_result=eval_result, + model_contract=contract, + preset=get_preset("yolov8_coco").to_dict(), + ) + + assert report["contract_validation"]["input_shape"]["status"] == "mismatch" + assert report["deployment_signal"]["decision"] == "blocked"
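A minimal usage sketch of how the validation helpers introduced above compose, assuming the package is importable and the `examples/validation_demo/` fixtures added in this change are present at the repo root:

```python
# Sketch only: check the intentionally mismatched demo contract against the
# yolov8_coco preset defaults using the new validation modules.
from inferedgelab.validation.model_contract import build_default_contract, load_model_contract
from inferedgelab.validation.structural import validate_shape

contract = load_model_contract("examples/validation_demo/problem_model_contract.json")
preset_contract = build_default_contract(contract.preset)

# The problem fixture declares a 1x3x320x320 input, while the yolov8_coco preset expects 1x3x640x640.
check = validate_shape(contract.input.shape, preset_contract.input.shape)
print(contract.metadata["expected_issue"])
print(check["status"])  # expected: "mismatch"
```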