From 08d60d034daf37f8f6b8aaa2776aa9378a39b2a8 Mon Sep 17 00:00:00 2001 From: hyeokjun32 Date: Fri, 1 May 2026 17:32:31 +0900 Subject: [PATCH] feat: add validation problem demo cases --- README.md | 2 + docs/portfolio/inferedge_pipeline_status.md | 2 + .../inferedge_portfolio_submission.md | 1 + docs/portfolio/validation_problem_cases.md | 24 ++++++++ .../validation_demo/problem_cases/README.md | 12 ++++ .../annotation_missing_report.json | 48 +++++++++++++++ .../contract_shape_mismatch_report.json | 48 +++++++++++++++ .../invalid_detection_structure_report.json | 61 +++++++++++++++++++ inferedgelab/studio/routes.py | 43 ++++++++++++- inferedgelab/studio/static/app.js | 35 +++++++++++ inferedgelab/studio/static/index.html | 9 +-- inferedgelab/studio/static/style.css | 30 +++++++++ tests/test_studio_routes.py | 19 ++++-- tests/test_validation_demo_report.py | 22 +++++++ 14 files changed, 347 insertions(+), 9 deletions(-) create mode 100644 docs/portfolio/validation_problem_cases.md create mode 100644 examples/validation_demo/problem_cases/README.md create mode 100644 examples/validation_demo/problem_cases/annotation_missing_report.json create mode 100644 examples/validation_demo/problem_cases/contract_shape_mismatch_report.json create mode 100644 examples/validation_demo/problem_cases/invalid_detection_structure_report.json diff --git a/README.md b/README.md index 091da17..3a7724a 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ TensorRT Jetson was 4.6x faster than ONNX Runtime CPU in this real image input b The benchmark uses end-to-end Runtime latency, not trtexec GPU-only latency. The full pipeline portfolio summary is available at [docs/portfolio/inferedge_pipeline_portfolio.md](docs/portfolio/inferedge_pipeline_portfolio.md), and the detailed Runtime comparison report is available at [docs/portfolio/runtime_compare_yolov8n.md](docs/portfolio/runtime_compare_yolov8n.md). The YOLOv8 COCO subset accuracy demo is documented in [docs/portfolio/yolov8_coco_subset_evaluation.md](docs/portfolio/yolov8_coco_subset_evaluation.md). +Validation problem cases are documented in [docs/portfolio/validation_problem_cases.md](docs/portfolio/validation_problem_cases.md). ## Local Studio Demo Evidence @@ -102,6 +103,7 @@ Verified demo fixture values: Studio reports this as a `4.57x` TensorRT speedup for the bundled demo pair. AIGuard remains optional in this local Studio path; if Guard evidence is not loaded, the deployment decision explains that the Lab comparison is available but diagnosis evidence is not provided. The same demo flow also surfaces a small `yolov8_coco` evaluation report summary: 10 images, 89 ground-truth boxes, mAP@50 `0.1410`, precision `0.2941`, recall `0.1685`, structural validation `passed`. +It also includes problem-case summaries for annotation-missing review, invalid detection structure blocking, and contract shape mismatch blocking. --- diff --git a/docs/portfolio/inferedge_pipeline_status.md b/docs/portfolio/inferedge_pipeline_status.md index 05fa078..ae31c7a 100644 --- a/docs/portfolio/inferedge_pipeline_status.md +++ b/docs/portfolio/inferedge_pipeline_status.md @@ -97,6 +97,7 @@ The current cross-repository loop is covered by documentation, fixtures, and smo - Lab deployment decision/report evidence smoke for AIGuard worker provenance diagnosis - Local Studio local-first workflow UI for viewing Forge -> Runtime -> Lab -> optional AIGuard state, creating in-memory analyze jobs, importing Runtime result JSON, replaying bundled demo evidence, comparing backends, and inspecting Lab-owned deployment decision context - YOLOv8 COCO subset evaluation report generated from 10 local images and 89 converted COCO-style person annotations, with mAP@50 0.1410, precision 0.2941, recall 0.1685, and structural validation passed +- Validation problem case fixtures for annotation-missing review, invalid detection structure blocking, and contract shape mismatch blocking This means the current product boundary is testable without running the production worker infrastructure. @@ -127,6 +128,7 @@ Demo readiness: `scripts/demo_pipeline_full.sh` now provides a guided end-to-end - Guided end-to-end demo entrypoint for portfolio and interview walkthroughs - Local Studio at `/studio` for a local-first browser view of Run / Import / Demo Evidence / Compare / Decision / Jetson Helper workflows - Contract/preset validation demo with `yolov8_coco`, COCO annotation loading, simplified accuracy metrics, structural validation, and JSON/Markdown/HTML report fixtures +- Problem-case validation reports that make skipped accuracy, invalid output structure, and contract mismatch visible in Local Studio - Cross-repo fixture compatibility across Forge, Runtime, Lab, and AIGuard - Rule/evidence based provenance mismatch diagnosis diff --git a/docs/portfolio/inferedge_portfolio_submission.md b/docs/portfolio/inferedge_portfolio_submission.md index 08c7563..b8b2653 100644 --- a/docs/portfolio/inferedge_portfolio_submission.md +++ b/docs/portfolio/inferedge_portfolio_submission.md @@ -113,6 +113,7 @@ Recent validation evidence: - Guided demo entrypoint: `scripts/demo_pipeline_full.sh` summarizes the full Forge -> Runtime -> Lab -> optional AIGuard flow and can print the Jetson TensorRT Runtime command without claiming production worker or SaaS readiness. - Local Studio demo evidence: `/studio` can load bundled ONNX Runtime CPU and TensorRT Jetson Runtime result fixtures from `examples/studio_demo`, keep the demo pair selectable in Recent jobs while the local server process is alive, and show TensorRT Jetson vs ONNX Runtime CPU comparison in the browser. The fixture-backed evidence records ONNX Runtime CPU at mean 45.4299 ms / p99 49.2128 ms / 22.0119 FPS and TensorRT Jetson at mean 9.9375 ms / p99 15.5231 ms / 100.6293 FPS, a 4.57x TensorRT speedup for this demo pair. - YOLOv8 COCO subset evaluation: a 10-image local person-detection subset with 89 ground-truth boxes is converted into a COCO-style annotation fixture and evaluated through the `yolov8_coco` preset. The generated report records mAP@50 0.1410, precision 0.2941, recall 0.1685, and structural validation passed. This is documented as subset workflow evidence, not a full COCO benchmark claim. +- Validation problem cases: the demo bundle includes annotation-missing, invalid detection structure, and contract shape mismatch reports. These show that InferEdge records review/block evidence explicitly instead of presenting every validation path as successful. The direct Runtime execution result includes `deployment_decision`. Its `unknown` value is expected before Lab compare/report because the worker response has not yet been compared by Lab. diff --git a/docs/portfolio/validation_problem_cases.md b/docs/portfolio/validation_problem_cases.md new file mode 100644 index 0000000..a59b9f1 --- /dev/null +++ b/docs/portfolio/validation_problem_cases.md @@ -0,0 +1,24 @@ +# Validation Problem Cases + +InferEdge does not hide validation failures. These fixtures show how the Lab evidence layer records cases that need review or should be blocked. + +## Cases + +| Case | Decision Signal | What It Demonstrates | +|---|---|---| +| annotation missing | review | Accuracy is intentionally skipped when annotation evidence is unavailable. | +| invalid detection structure | blocked | Score/bbox structural checks can block malformed detection output. | +| contract shape mismatch | blocked | Runtime input shape must match the declared `model_contract.json`. | + +## Files + +- `examples/validation_demo/problem_cases/annotation_missing_report.json` +- `examples/validation_demo/problem_cases/invalid_detection_structure_report.json` +- `examples/validation_demo/problem_cases/contract_shape_mismatch_report.json` + +## Interpretation + +These are deliberately small report fixtures, not production SaaS records. +They make the portfolio story clearer: InferEdge is a contract/preset validation pipeline, so missing annotations, malformed outputs, and contract mismatches are explicit evidence states. + +Local Studio includes these problem cases in the `Load Demo Evidence` flow so the browser demo can show both the happy path and the review/block paths. diff --git a/examples/validation_demo/problem_cases/README.md b/examples/validation_demo/problem_cases/README.md new file mode 100644 index 0000000..621ae7a --- /dev/null +++ b/examples/validation_demo/problem_cases/README.md @@ -0,0 +1,12 @@ +# Validation Problem Demo Cases + +These fixtures show how InferEdgeLab records uncomfortable validation evidence instead of hiding it. +They are intentionally small JSON reports and do not include raw images. + +| Case | Signal | Reason | +|---|---|---| +| `annotation_missing_report.json` | review | Accuracy is skipped because annotation evidence is not provided. | +| `invalid_detection_structure_report.json` | blocked | Detection output contains invalid score/bbox structure. | +| `contract_shape_mismatch_report.json` | blocked | Runtime input shape does not match `model_contract.json`. | + +These reports are portfolio fixtures, not production SaaS data. diff --git a/examples/validation_demo/problem_cases/annotation_missing_report.json b/examples/validation_demo/problem_cases/annotation_missing_report.json new file mode 100644 index 0000000..9372e67 --- /dev/null +++ b/examples/validation_demo/problem_cases/annotation_missing_report.json @@ -0,0 +1,48 @@ +{ + "report_role": "inferedge-evaluation-report", + "generated_at": "2026-05-01T00:00:00Z", + "problem_case": "annotation_missing", + "preset": { + "name": "yolov8_coco", + "task": "object_detection" + }, + "runtime_result": { + "engine": "onnxruntime", + "device": "cpu", + "sample_count": 10, + "actual_input_shape": [1, 3, 640, 640] + }, + "accuracy": { + "status": "skipped", + "metrics": { + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0 + }, + "reason": "annotation_not_provided" + }, + "contract_validation": { + "input_shape": { + "status": "passed", + "actual_shape": [1, 3, 640, 640], + "expected_shape": [1, 3, 640, 640] + }, + "preset": "yolov8_coco", + "task": "object_detection" + }, + "structural_validation": { + "status": "passed", + "checked": { + "image_count": 10, + "detection_count": 51, + "num_classes": 80 + }, + "issues": [] + }, + "deployment_signal": { + "decision": "review", + "reason": "Accuracy evaluation was skipped because annotations were not provided." + } +} diff --git a/examples/validation_demo/problem_cases/contract_shape_mismatch_report.json b/examples/validation_demo/problem_cases/contract_shape_mismatch_report.json new file mode 100644 index 0000000..7f12152 --- /dev/null +++ b/examples/validation_demo/problem_cases/contract_shape_mismatch_report.json @@ -0,0 +1,48 @@ +{ + "report_role": "inferedge-evaluation-report", + "generated_at": "2026-05-01T00:00:00Z", + "problem_case": "contract_shape_mismatch", + "preset": { + "name": "yolov8_coco", + "task": "object_detection" + }, + "runtime_result": { + "engine": "onnxruntime", + "device": "cpu", + "sample_count": 1, + "actual_input_shape": [1, 3, 640, 640] + }, + "accuracy": { + "status": "skipped", + "metrics": { + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0 + }, + "reason": "model_contract_input_shape_mismatch" + }, + "contract_validation": { + "input_shape": { + "status": "mismatch", + "actual_shape": [1, 3, 640, 640], + "expected_shape": [1, 3, 320, 320] + }, + "preset": "yolov8_coco", + "task": "object_detection" + }, + "structural_validation": { + "status": "passed", + "checked": { + "image_count": 1, + "detection_count": 0, + "num_classes": 80 + }, + "issues": [] + }, + "deployment_signal": { + "decision": "blocked", + "reason": "Actual runtime input shape does not match the model contract." + } +} diff --git a/examples/validation_demo/problem_cases/invalid_detection_structure_report.json b/examples/validation_demo/problem_cases/invalid_detection_structure_report.json new file mode 100644 index 0000000..3ec76d8 --- /dev/null +++ b/examples/validation_demo/problem_cases/invalid_detection_structure_report.json @@ -0,0 +1,61 @@ +{ + "report_role": "inferedge-evaluation-report", + "generated_at": "2026-05-01T00:00:00Z", + "problem_case": "invalid_detection_structure", + "preset": { + "name": "yolov8_coco", + "task": "object_detection" + }, + "runtime_result": { + "engine": "onnxruntime", + "device": "cpu", + "sample_count": 1, + "actual_input_shape": [1, 3, 640, 640] + }, + "accuracy": { + "status": "skipped", + "metrics": { + "map50": 0.0, + "map50_95": 0.0, + "f1_score": 0.0, + "precision": 0.0, + "recall": 0.0 + }, + "reason": "structural_validation_failed_before_accuracy" + }, + "contract_validation": { + "input_shape": { + "status": "passed", + "actual_shape": [1, 3, 640, 640], + "expected_shape": [1, 3, 640, 640] + }, + "preset": "yolov8_coco", + "task": "object_detection" + }, + "structural_validation": { + "status": "failed", + "checked": { + "image_count": 1, + "detection_count": 1, + "num_classes": 80 + }, + "issues": [ + { + "image_index": 0, + "detection_index": 0, + "code": "score_out_of_range", + "value": 1.42 + }, + { + "image_index": 0, + "detection_index": 0, + "code": "bbox_non_positive_size", + "value": [320.0, 320.0, -12.0, 48.0] + } + ] + }, + "deployment_signal": { + "decision": "blocked", + "reason": "Structural validation found invalid detection output." + } +} diff --git a/inferedgelab/studio/routes.py b/inferedgelab/studio/routes.py index 59196e9..e948463 100644 --- a/inferedgelab/studio/routes.py +++ b/inferedgelab/studio/routes.py @@ -22,11 +22,17 @@ STATIC_DIR = Path(__file__).resolve().parent / "static" DEMO_EVIDENCE_DIR = Path(__file__).resolve().parents[2] / "examples" / "studio_demo" VALIDATION_DEMO_DIR = Path(__file__).resolve().parents[2] / "examples" / "validation_demo" / "subset" +VALIDATION_PROBLEM_DIR = Path(__file__).resolve().parents[2] / "examples" / "validation_demo" / "problem_cases" DEMO_EVIDENCE_FILES = ( "onnxruntime_cpu_result.json", "tensorrt_jetson_result.json", ) DEMO_EVALUATION_REPORT = "yolov8_coco_subset_evaluation.json" +DEMO_PROBLEM_REPORTS = ( + "annotation_missing_report.json", + "invalid_detection_structure_report.json", + "contract_shape_mismatch_report.json", +) DEMO_JOB_ID = "demo_yolov8n_trt_vs_onnx" STATIC_ASSETS = { "app.js": "application/javascript", @@ -159,10 +165,11 @@ def studio_import(request: Request, payload: dict[str, Any] = Body(...)) -> dict def studio_demo_evidence(request: Request) -> dict[str, Any]: results = [_load_demo_result(file_name) for file_name in DEMO_EVIDENCE_FILES] evaluation_report = _load_demo_evaluation_report() + problem_cases = _load_demo_problem_cases() imported_results = _get_imported_results(request) imported_results.extend(results) compare = _build_imported_compare_response(results[0], results[1]) - demo_job = _build_demo_job(results, compare, evaluation_report) + demo_job = _build_demo_job(results, compare, evaluation_report, problem_cases) _get_demo_jobs(request)[DEMO_JOB_ID] = demo_job return { "status": "loaded", @@ -174,6 +181,7 @@ def studio_demo_evidence(request: Request) -> dict[str, Any]: "compare_ready": True, "compare": compare, "evaluation_report": evaluation_report, + "problem_cases": problem_cases, "deployment_decision": compare["deployment_decision"], } @@ -327,10 +335,42 @@ def _load_demo_evaluation_report() -> dict[str, Any]: } +def _load_demo_problem_cases() -> list[dict[str, Any]]: + return [_load_problem_report(file_name) for file_name in DEMO_PROBLEM_REPORTS] + + +def _load_problem_report(file_name: str) -> dict[str, Any]: + path = VALIDATION_PROBLEM_DIR / file_name + try: + report = json.loads(path.read_text(encoding="utf-8")) + except OSError as exc: + raise HTTPException(status_code=500, detail=f"demo problem report not found: {file_name}") from exc + except json.JSONDecodeError as exc: + raise HTTPException(status_code=500, detail=f"demo problem report is invalid JSON: {file_name}") from exc + + problem_case = report.get("problem_case") if isinstance(report, dict) else None + deployment_signal = report.get("deployment_signal") if isinstance(report, dict) else None + structural = report.get("structural_validation") if isinstance(report, dict) else None + contract = report.get("contract_validation") if isinstance(report, dict) else None + accuracy = report.get("accuracy") if isinstance(report, dict) else None + if not isinstance(problem_case, str) or not isinstance(deployment_signal, dict): + raise HTTPException(status_code=500, detail=f"demo problem report schema error: {file_name}") + + return { + "problem_case": problem_case, + "source": f"examples/validation_demo/problem_cases/{file_name}", + "deployment_signal": deployment_signal, + "accuracy": accuracy if isinstance(accuracy, dict) else {}, + "structural_validation": structural if isinstance(structural, dict) else {}, + "contract_validation": contract if isinstance(contract, dict) else {}, + } + + def _build_demo_job( results: list[dict[str, Any]], compare: dict[str, Any], evaluation_report: dict[str, Any], + problem_cases: list[dict[str, Any]], ) -> dict[str, Any]: now = _utc_now_iso() runtime_result = results[-1] if results else {} @@ -350,6 +390,7 @@ def _build_demo_job( "comparison": compare, "deployment_decision": compare["deployment_decision"], "evaluation_report": evaluation_report, + "problem_cases": problem_cases, "summary": compare["judgement"]["summary"], }, "error": None, diff --git a/inferedgelab/studio/static/app.js b/inferedgelab/studio/static/app.js index c6cd079..39f2444 100644 --- a/inferedgelab/studio/static/app.js +++ b/inferedgelab/studio/static/app.js @@ -29,6 +29,7 @@ let compareData = null; let activeDecision = null; let importedResult = null; let demoEvaluationReport = null; +let demoProblemCases = []; const importedResultsByJobId = {}; function createElement(tagName, className, textContent) { @@ -367,6 +368,7 @@ async function loadDemoEvidence() { const results = Array.isArray(payload.results) ? payload.results : []; importedResult = results[results.length - 1] || null; demoEvaluationReport = payload.evaluation_report || null; + demoProblemCases = Array.isArray(payload.problem_cases) ? payload.problem_cases : []; compareData = payload.compare || null; selectedJobId = payload.job_id || payload.job?.job_id || selectedJobId; selectedJob = payload.job || selectedJob; @@ -376,6 +378,7 @@ async function loadDemoEvidence() { setStatus("#import-status", "Success: demo ONNX Runtime + TensorRT evidence imported.", "success"); renderImportEvidence({ result: importedResult }); renderDemoEvaluation(demoEvaluationReport); + renderDemoProblemCases(demoProblemCases); renderImportedResult(); await loadJobs(selectedJobId); await loadCompare(); @@ -388,6 +391,37 @@ async function loadDemoEvidence() { } } +function renderDemoProblemCases(problemCases = []) { + const target = document.querySelector("#demo-problem-cases"); + if (!target) { + return; + } + target.replaceChildren(); + + if (!problemCases.length) { + return; + } + + problemCases.forEach((problem) => { + const signal = problem.deployment_signal || {}; + const structural = problem.structural_validation || {}; + const contractShape = problem.contract_validation?.input_shape || {}; + const accuracy = problem.accuracy || {}; + const card = createElement("article", `problem-case ${decisionTone(signal.decision)}`); + card.append( + createElement("p", "caption", problem.problem_case || "problem case"), + createElement("h4", "", String(signal.decision || "review").toUpperCase()), + createElement("p", "body-text", signal.reason || "Validation evidence requires review."), + createElement( + "p", + "caption", + `accuracy=${accuracy.status || "-"} / structure=${structural.status || "-"} / contract=${contractShape.status || "-"}`, + ), + ); + target.append(card); + }); +} + function renderDemoEvaluation(report) { const target = document.querySelector("#demo-report-summary"); if (!target) { @@ -476,6 +510,7 @@ function renderRunPanel() { setState("#jetson-state", "idle"); setState("#demo-state", "idle"); renderDemoEvaluation(null); + renderDemoProblemCases([]); } function resetTransientInputs() { diff --git a/inferedgelab/studio/static/index.html b/inferedgelab/studio/static/index.html index 3594610..818aeda 100644 --- a/inferedgelab/studio/static/index.html +++ b/inferedgelab/studio/static/index.html @@ -137,8 +137,8 @@ } } - - + +
@@ -267,6 +267,7 @@

Replay validation evidence

+
@@ -332,7 +333,7 @@

Future Work

- - + + diff --git a/inferedgelab/studio/static/style.css b/inferedgelab/studio/static/style.css index 5ebdad2..e30150a 100644 --- a/inferedgelab/studio/static/style.css +++ b/inferedgelab/studio/static/style.css @@ -569,6 +569,32 @@ body.file-mode .file-protocol-warning { grid-column: 1 / -1; } +.problem-case-grid { + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 8px; +} + +.problem-case { + border: 1px solid var(--line); + border-radius: 10px; + background: rgba(15, 23, 42, 0.78); + padding: 10px; +} + +.problem-case h4 { + margin: 4px 0 8px; + font-size: 0.95rem; +} + +.problem-case.blocked { + border-color: rgba(239, 68, 68, 0.35); +} + +.problem-case.review { + border-color: rgba(234, 179, 8, 0.35); +} + .metric-name, .metric-value { display: block; @@ -726,6 +752,10 @@ body.file-mode .file-protocol-warning { grid-template-columns: 1fr; } + .problem-case-grid { + grid-template-columns: 1fr; + } + .inline-fields, .future-heading { grid-template-columns: 1fr; diff --git a/tests/test_studio_routes.py b/tests/test_studio_routes.py index a9d2436..0440213 100644 --- a/tests/test_studio_routes.py +++ b/tests/test_studio_routes.py @@ -60,10 +60,10 @@ def test_studio_route_returns_local_studio_html(): assert "Import" in html assert "Jetson Helper" in html assert 'data-critical="studio-dark"' in html - assert 'href="/studio/static/style.css?v=16"' in html - assert 'href="style.css?v=16"' in html - assert 'src="/studio/static/app.js?v=16"' in html - assert 'src="app.js?v=16"' in html + assert 'href="/studio/static/style.css?v=17"' in html + assert 'href="style.css?v=17"' in html + assert 'src="/studio/static/app.js?v=17"' in html + assert 'src="app.js?v=17"' in html assert "file-protocol-warning" in html assert 'placeholder="results/latest.json"' in html assert 'value="results/latest.json"' not in html @@ -77,6 +77,7 @@ def test_studio_route_returns_local_studio_html(): assert "Load Demo Evidence" in html assert 'id="demo-state"' in html assert 'id="demo-report-summary"' in html + assert 'id="demo-problem-cases"' in html def test_studio_static_assets_are_served(): @@ -128,6 +129,7 @@ def test_studio_static_assets_include_redesigned_ui_contracts(): assert "request record only" in app_text assert "loadDemoEvidence" in app_text assert "renderDemoEvaluation" in app_text + assert "renderDemoProblemCases" in app_text assert "/studio/api/demo-evidence" in app_text assert "jobDisplayName" in app_text assert "jobCaption" in app_text @@ -145,6 +147,7 @@ def test_studio_static_assets_include_redesigned_ui_contracts(): assert ".compare-card.improvement" in style_text assert ".demo-card" in style_text assert ".demo-report-summary" in style_text + assert ".problem-case-grid" in style_text assert ".compare-stat-list" in style_text assert ".job-row .state-pill" in style_text assert "flex-wrap: wrap" in style_text @@ -353,6 +356,13 @@ def test_studio_demo_evidence_loads_compare_ready_pair(): assert response["evaluation_report"]["accuracy"]["status"] == "evaluated" assert response["evaluation_report"]["accuracy"]["metrics"]["map50"] > 0 assert response["evaluation_report"]["structural_validation"]["status"] == "passed" + assert len(response["problem_cases"]) == 3 + assert {case["problem_case"] for case in response["problem_cases"]} == { + "annotation_missing", + "invalid_detection_structure", + "contract_shape_mismatch", + } + assert {case["deployment_signal"]["decision"] for case in response["problem_cases"]} == {"review", "blocked"} assert compare["status"] == "ok" assert compare["base"]["backend_key"] == "onnxruntime__cpu" assert compare["new"]["backend_key"] == "tensorrt__jetson" @@ -378,6 +388,7 @@ def test_studio_demo_evidence_is_listed_and_selectable_as_job(): assert detail["result"]["comparison"]["base"]["backend_key"] == "onnxruntime__cpu" assert detail["result"]["comparison"]["new"]["backend_key"] == "tensorrt__jetson" assert detail["result"]["evaluation_report"]["accuracy"]["metrics"]["precision"] > 0 + assert detail["result"]["problem_cases"][1]["structural_validation"]["status"] == "failed" def test_studio_importing_two_compatible_results_returns_compare_data(): diff --git a/tests/test_validation_demo_report.py b/tests/test_validation_demo_report.py index 19cad6e..424c712 100644 --- a/tests/test_validation_demo_report.py +++ b/tests/test_validation_demo_report.py @@ -22,3 +22,25 @@ def test_yolov8_coco_subset_demo_report_contains_evaluated_accuracy(): assert round(report["accuracy"]["metrics"]["recall"], 4) == 0.1685 assert report["structural_validation"]["status"] == "passed" assert report["contract_validation"]["input_shape"]["status"] == "passed" + + +def test_validation_problem_case_reports_cover_review_and_blocked_paths(): + repo_root = Path(__file__).resolve().parents[1] + problem_dir = repo_root / "examples" / "validation_demo" / "problem_cases" + + reports = { + path.name: json.loads(path.read_text(encoding="utf-8")) + for path in sorted(problem_dir.glob("*_report.json")) + } + + assert set(reports) == { + "annotation_missing_report.json", + "contract_shape_mismatch_report.json", + "invalid_detection_structure_report.json", + } + assert reports["annotation_missing_report.json"]["accuracy"]["status"] == "skipped" + assert reports["annotation_missing_report.json"]["deployment_signal"]["decision"] == "review" + assert reports["invalid_detection_structure_report.json"]["structural_validation"]["status"] == "failed" + assert reports["invalid_detection_structure_report.json"]["deployment_signal"]["decision"] == "blocked" + assert reports["contract_shape_mismatch_report.json"]["contract_validation"]["input_shape"]["status"] == "mismatch" + assert reports["contract_shape_mismatch_report.json"]["deployment_signal"]["decision"] == "blocked"