diff --git a/inferedgelab/studio/routes.py b/inferedgelab/studio/routes.py
index b5a08b6..026025d 100644
--- a/inferedgelab/studio/routes.py
+++ b/inferedgelab/studio/routes.py
@@ -102,7 +102,11 @@ def studio_job_detail(request: Request, job_id: str) -> dict[str, Any]:
 def studio_compare_latest(request: Request) -> dict[str, Any]:
     imported_results = _get_imported_results(request)
     if len(imported_results) >= 2:
-        return _build_imported_compare_response(imported_results[-2], imported_results[-1])
+        return _build_imported_compare_response(
+            imported_results[-2],
+            imported_results[-1],
+            guard_analysis=_get_studio_guard_analysis(request),
+        )
 
     endpoint = _get_api_endpoint(request.app, "/api/compare-latest")
     try:
@@ -153,6 +157,7 @@ def studio_import(request: Request, payload: dict[str, Any] = Body(...)) -> dict
     result = _apply_backend_override(result, payload.get("backend_override"))
     imported_results = _get_imported_results(request)
     imported_results.append(result)
+    request.app.state.studio_guard_analysis = None
     return {
         "status": "imported",
         "source": "studio-memory",
@@ -169,7 +174,13 @@ def studio_demo_evidence(request: Request) -> dict[str, Any]:
     problem_cases = _load_demo_problem_cases()
     imported_results = _get_imported_results(request)
     imported_results.extend(results)
-    compare = _build_imported_compare_response(results[0], results[1])
+    guard_analysis = _build_demo_guard_analysis(results, evaluation_report)
+    request.app.state.studio_guard_analysis = guard_analysis
+    compare = _build_imported_compare_response(
+        results[0],
+        results[1],
+        guard_analysis=guard_analysis,
+    )
     demo_job = _build_demo_job(results, compare, evaluation_report, problem_cases)
     _get_demo_jobs(request)[DEMO_JOB_ID] = demo_job
     return {
@@ -183,6 +194,7 @@ def studio_demo_evidence(request: Request) -> dict[str, Any]:
         "compare": compare,
         "evaluation_report": evaluation_report,
         "problem_cases": problem_cases,
+        "guard_analysis": guard_analysis,
         "deployment_decision": compare["deployment_decision"],
     }
 
@@ -217,6 +229,7 @@ def register_studio(app: FastAPI, job_store: Any | None = None) -> None:
     app.state.studio_job_store = job_store
     app.state.studio_imported_results = []
     app.state.studio_demo_jobs = {}
+    app.state.studio_guard_analysis = None
     app.include_router(router)
 
 
@@ -240,6 +253,11 @@ def _get_demo_jobs(request: Request) -> dict[str, dict[str, Any]]:
     return demo_jobs
 
 
+def _get_studio_guard_analysis(request: Request) -> dict[str, Any] | None:
+    guard_analysis = getattr(request.app.state, "studio_guard_analysis", None)
+    return guard_analysis if isinstance(guard_analysis, dict) else None
+
+
 def _get_api_endpoint(app: FastAPI, path: str) -> Any:
     for route in app.routes:
         if getattr(route, "path", None) == path:
@@ -269,25 +287,33 @@ def _load_import_payload(payload: dict[str, Any]) -> dict[str, Any]:
     return _with_compare_keys(result)
 
 
-def _build_imported_compare_response(base: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
+def _build_imported_compare_response(
+    base: dict[str, Any],
+    new: dict[str, Any],
+    guard_analysis: dict[str, Any] | None = None,
+) -> dict[str, Any]:
     result = compare_results(base, new)
     judgement = judge_comparison(result)
-    deployment_decision = build_deployment_decision(judgement)
+    deployment_decision = build_deployment_decision(judgement, guard_analysis=guard_analysis)
+    data = {
+        "base": base,
+        "new": new,
+        "result": result,
+        "judgement": judgement,
+        "deployment_decision": deployment_decision,
+    }
+    if guard_analysis is not None:
+        data["guard_analysis"] = guard_analysis
     return {
         "status": "ok",
         "source": "studio-memory",
-        "data": {
-            "base": base,
-            "new": new,
-            "result": result,
-            "judgement": judgement,
-            "deployment_decision": deployment_decision,
-        },
+        "data": data,
         "base": base,
         "new": new,
         "result": result,
         "judgement": judgement,
         "deployment_decision": deployment_decision,
+        **({"guard_analysis": guard_analysis} if guard_analysis is not None else {}),
     }
 
 
@@ -421,6 +447,7 @@ def _build_demo_job(
         "runtime_result": runtime_result,
         "comparison": compare,
         "deployment_decision": compare["deployment_decision"],
+        "guard_analysis": compare.get("guard_analysis"),
         "evaluation_report": evaluation_report,
         "problem_cases": problem_cases,
         "summary": compare["judgement"]["summary"],
@@ -434,6 +461,107 @@ def _build_demo_job(
     }
 
 
+def _build_demo_guard_analysis(
+    results: list[dict[str, Any]],
+    evaluation_report: dict[str, Any],
+) -> dict[str, Any]:
+    baseline = results[0] if results else {}
+    candidate = results[-1] if results else {}
+    accuracy_metrics = evaluation_report.get("accuracy", {}).get("metrics", {})
+    structural = evaluation_report.get("structural_validation") or {}
+    contract = evaluation_report.get("contract_validation", {}).get("input_shape") or {}
+    map50 = accuracy_metrics.get("map50")
+    precision = accuracy_metrics.get("precision")
+    recall = accuracy_metrics.get("recall")
+    verdict = "review_required" if isinstance(map50, (int, float)) and map50 < 0.2 else "pass"
+    severity = "medium" if verdict == "review_required" else "low"
+    source = {
+        "runtime_result_path": candidate.get("_source_path") or "examples/studio_demo/tensorrt_jetson_result.json",
+        "baseline_profile_path": baseline.get("_source_path") or "examples/studio_demo/onnxruntime_cpu_result.json",
+        "evaluation_report_path": evaluation_report.get("source"),
+        "model_contract_path": "examples/validation_demo/subset/model_contract.json",
+        "lab_result_path": "studio.demo_evidence",
+    }
+    evidence = [
+        {
+            "type": "accuracy_signal",
+            "metric_name": "map50",
+            "observed_value": map50,
+            "baseline_value": None,
+            "threshold": 0.2,
+            "severity": severity,
+            "status": "warning" if verdict == "review_required" else "passed",
+            "explanation": (
+                "Demo mAP50 is below the review threshold, so Lab should keep this as validation evidence "
+                "instead of treating latency speedup alone as deployment-ready."
+            ),
+            "why_it_matters": "Latency improvement does not prove detection quality is deployment-ready.",
+            "suspected_causes": [
+                "Small validation subset",
+                "Model/preset calibration gap",
+                "Postprocess or threshold tuning needed",
+            ],
+            "recommendation": "Review accuracy evidence with a larger validation subset before deployment.",
+            "raw_context": {
+                "precision": precision,
+                "recall": recall,
+                "structural_status": structural.get("status"),
+                "contract_status": contract.get("status"),
+            },
+        },
+        {
+            "type": "contract_validation",
+            "metric_name": "input_shape_status",
+            "observed_value": contract.get("status"),
+            "baseline_value": "passed",
+            "threshold": "passed",
+            "severity": "low" if contract.get("status") == "passed" else "high",
+            "status": "passed" if contract.get("status") == "passed" else "failed",
+            "explanation": "The demo model contract input shape check is recorded as structured evidence.",
+            "why_it_matters": "Contract mismatch can make accuracy metrics unreliable.",
+            "suspected_causes": [],
+            "recommendation": "Keep model_contract evidence attached to the Lab report.",
+            "raw_context": contract,
+        },
+    ]
+    return {
+        "schema_version": "inferedge-aiguard-diagnosis-v1",
+        "source": source,
+        "guard_verdict": verdict,
+        "severity": severity,
+        "confidence": 0.82,
+        "primary_reason": (
+            "Latency improved, but demo accuracy evidence still requires review."
+            if verdict == "review_required"
+            else "Demo validation evidence is within configured Guard thresholds."
+        ),
+        "evidence": evidence,
+        "suspected_causes": [
+            "Small validation subset",
+            "Detection threshold tuning needed",
+        ]
+        if verdict == "review_required"
+        else [],
+        "recommendations": [
+            "Use this demo as portfolio evidence, then validate with a larger representative dataset before deployment.",
+            "Keep AIGuard evidence optional and let Lab own the final deployment decision.",
+        ],
+        "thresholds": {"map50_review": 0.2},
+        "baseline_summary": {
+            "backend_key": baseline.get("backend_key"),
+            "mean_ms": baseline.get("mean_ms"),
+            "p99_ms": baseline.get("p99_ms"),
+        },
+        "candidate_summary": {
+            "backend_key": candidate.get("backend_key"),
+            "mean_ms": candidate.get("mean_ms"),
+            "p99_ms": candidate.get("p99_ms"),
+            "map50": map50,
+        },
+        "created_at": _utc_now_iso(),
+    }
+
+
 def _build_analyze_display_name(job: dict[str, Any]) -> str:
     input_summary = job.get("input_summary") or {}
     model_path = _first_display_value(input_summary.get("model_path"), input_summary.get("artifact_path"))
diff --git a/inferedgelab/studio/static/app.js b/inferedgelab/studio/static/app.js
index 5d17ff7..0987777 100644
--- a/inferedgelab/studio/static/app.js
+++ b/inferedgelab/studio/static/app.js
@@ -30,6 +30,7 @@ let activeDecision = null;
 let importedResult = null;
 let demoEvaluationReport = null;
 let demoProblemCases = [];
+let activeGuardAnalysis = null;
 const importedResultsByJobId = {};
 
 function createElement(tagName, className, textContent) {
@@ -197,6 +198,7 @@ async function loadJobDetail(jobId) {
     renderJobDetail();
     renderJobList();
     updateDecision(extractDecision(selectedJob));
+    updateGuardEvidence(extractGuardAnalysis(selectedJob));
   } catch (error) {
     selectedJob = null;
     renderJobDetail(`Unable to load ${jobId}: ${formatError(error)}`);
@@ -210,6 +212,7 @@ async function loadCompare() {
     compareData = await fetchJson("/studio/api/compare/latest");
     renderCompare();
     const decision = extractDecision(compareData);
+    updateGuardEvidence(extractGuardAnalysis(compareData));
     if (compareData.status === "empty") {
       activeDecision = null;
       renderDecision(decision);
@@ -219,6 +222,7 @@ async function loadCompare() {
   } catch (error) {
     compareData = { status: "error", error: formatError(error) };
     renderCompare();
+    updateGuardEvidence(null);
     if (!activeDecision) {
       updateDecision(null);
     }
@@ -370,6 +374,7 @@ async function loadDemoEvidence() {
     demoEvaluationReport = payload.evaluation_report || null;
     demoProblemCases = Array.isArray(payload.problem_cases) ? payload.problem_cases : [];
     compareData = payload.compare || null;
+    updateGuardEvidence(payload.guard_analysis || payload.compare?.guard_analysis || null);
     selectedJobId = payload.job_id || payload.job?.job_id || selectedJobId;
     selectedJob = payload.job || selectedJob;
     setState("#demo-state", "completed");
@@ -731,6 +736,123 @@ function updateDecision(decision) {
   renderDecision(decision);
 }
 
+function renderGuardEvidence(guardAnalysis) {
+  const target = document.querySelector("#guard-evidence-panel");
+  if (!target) {
+    return;
+  }
+  target.replaceChildren();
+
+  if (!guardAnalysis) {
+    target.className = "guard-panel idle";
+    target.append(
+      createElement("p", "caption", "AIGuard"),
+      createElement("h3", "", "OPTIONAL"),
+      createElement("p", "body-text", "No AIGuard diagnosis evidence is loaded for this local workflow yet."),
+      createElement("p", "caption", "Load Demo Evidence or run compare with guard-backed diagnosis evidence."),
+    );
+    return;
+  }
+
+  const verdict = guardVerdict(guardAnalysis);
+  target.className = `guard-panel ${decisionTone(verdict)}`;
+  target.append(
+    createElement("p", "caption", "AIGuard diagnosis evidence"),
+    createElement("h3", "", verdict.toUpperCase()),
+    createElement("p", "body-text", guardAnalysis.primary_reason || guardAnalysis.reason || "Guard evidence is available."),
+    guardSummary(guardAnalysis, verdict),
+  );
+
+  const source = guardAnalysis.source || {};
+  if (Object.keys(source).length > 0) {
+    const sourcePanel = createElement("div", "guard-source");
+    sourcePanel.append(createElement("strong", "", "Source"));
+    Object.entries(source).forEach(([key, value]) => {
+      if (value !== undefined && value !== null && value !== "") {
+        sourcePanel.append(evidenceItem(key, value));
+      }
+    });
+    target.append(sourcePanel);
+  }
+
+  const evidence = guardEvidenceItems(guardAnalysis);
+  if (evidence.length > 0) {
+    const table = createElement("div", "guard-evidence-table");
+    table.append(guardEvidenceRow(["type", "metric", "observed", "threshold", "status"], true));
+    evidence.forEach((item) => {
+      table.append(
+        guardEvidenceRow([
+          item.type || "-",
+          item.metric_name || "-",
+          item.observed_value,
+          item.threshold,
+          item.status || item.severity || "-",
+        ]),
+      );
+    });
+    target.append(table);
+    target.append(guardExplanations(evidence));
+  }
+
+  target.append(guardList("Suspected causes", guardAnalysis.suspected_causes));
+  target.append(guardList("Recommendations", guardAnalysis.recommendations));
+}
+
+function updateGuardEvidence(guardAnalysis) {
+  activeGuardAnalysis = guardAnalysis || null;
+  renderGuardEvidence(activeGuardAnalysis);
+}
+
+function guardSummary(guardAnalysis, verdict) {
+  const summary = createElement("div", "guard-summary");
+  summary.append(
+    evidenceItem("guard_verdict", verdict),
+    evidenceItem("severity", guardAnalysis.severity || "-"),
+    evidenceItem("confidence", guardAnalysis.confidence ?? "-"),
+    evidenceItem("schema", guardAnalysis.schema_version || "legacy"),
+  );
+  return summary;
+}
+
+function guardEvidenceRow(values, heading = false) {
+  const row = createElement("div", heading ? "guard-row guard-row-heading" : "guard-row");
+  values.forEach((value) => row.append(createElement("span", "", formatValue(value))));
+  return row;
+}
+
+function guardExplanations(evidence) {
+  const explanations = createElement("div", "guard-explanations");
+  evidence.forEach((item) => {
+    if (!item.explanation && !item.recommendation) {
+      return;
+    }
+    const explanation = createElement("article", "detail-note");
+    explanation.append(
+      createElement("strong", "", item.metric_name || item.type || "evidence"),
+      createElement("p", "body-text", item.explanation || "-"),
+    );
+    if (item.recommendation) {
+      explanation.append(createElement("p", "caption", `Recommendation: ${item.recommendation}`));
+    }
+    explanations.append(explanation);
+  });
+  return explanations;
+}
+
+function guardList(title, values) {
+  const panel = createElement("div", "guard-list");
+  panel.append(createElement("strong", "", title));
+  const list = createElement("ul", "");
+  const items = Array.isArray(values) ? values : values ? [values] : [];
+  if (!items.length) {
+    list.append(createElement("li", "", "-"));
+  } else {
+    items.forEach((value) => list.append(createElement("li", "", formatValue(value))));
+  }
+  panel.append(list);
+  return panel;
+}
+
 function metricTile(label, value) {
   const tile = createElement("div", "metric-tile");
   tile.append(createElement("span", "metric-name", label), createElement("span", "metric-value", value));
@@ -835,6 +957,50 @@ function extractDecision(payload) {
   return payload.deployment_decision || payload.result?.deployment_decision || payload.data?.deployment_decision || null;
 }
 
+function extractGuardAnalysis(payload) {
+  if (!payload) {
+    return null;
+  }
+  return (
+    payload.guard_analysis ||
+    payload.result?.guard_analysis ||
+    payload.data?.guard_analysis ||
+    payload.result?.comparison?.guard_analysis ||
+    payload.result?.comparison?.data?.guard_analysis ||
+    null
+  );
+}
+
+function guardVerdict(guardAnalysis = {}) {
+  if (guardAnalysis.guard_verdict) {
+    return String(guardAnalysis.guard_verdict);
+  }
+  const status = String(guardAnalysis.status || "").toLowerCase();
+  if (status === "ok") {
+    return "pass";
+  }
+  if (status === "warning") {
+    return "review_required";
+  }
+  if (status === "error") {
+    return "blocked";
+  }
+  if (status === "skipped") {
+    return "skipped";
+  }
+  return "unknown";
+}
+
+function guardEvidenceItems(guardAnalysis = {}) {
+  if (Array.isArray(guardAnalysis.evidence)) {
+    return guardAnalysis.evidence;
+  }
+  if (Array.isArray(guardAnalysis.anomalies)) {
+    return guardAnalysis.anomalies;
+  }
+  return [];
+}
+
 function decisionReason(decision) {
   const decisionName = String(decision?.decision || "unknown").toLowerCase();
   if (decisionName === "unknown" && !decision?.guard_status) {
@@ -883,7 +1049,7 @@ function pipelineStatus() {
   const anyCompleted = currentJobs.some((job) => job.status === "completed") || Boolean(importedResult);
   const hasCompareDecision = Boolean(activeDecision);
   const hasImportedEvidence = Boolean(importedResult);
-  const hasGuardEvidence = Boolean(activeDecision?.guard_status);
+  const hasGuardEvidence = Boolean(activeGuardAnalysis || activeDecision?.guard_status || activeDecision?.guard_verdict);
   return {
     forge: importedResult ? "completed" : "idle",
     runtime: hasImportedEvidence || anyCompleted ? "completed" : anyRunning ? "running" : "idle",
"running" : "idle", @@ -897,7 +1063,7 @@ function normalizeState(state) { if (value === "queued") { return "running"; } - if (value === "completed" || value === "success" || value === "deployable") { + if (value === "completed" || value === "success" || value === "deployable" || value === "pass") { return "completed"; } if (value === "failed" || value === "blocked" || value === "error") { @@ -1064,6 +1230,7 @@ async function initLocalStudio() { renderJobDetail(); renderCompare(); updateDecision(null); + updateGuardEvidence(null); await loadJobs(); await loadCompare(); await loadJetsonCommand(); diff --git a/inferedgelab/studio/static/index.html b/inferedgelab/studio/static/index.html index 818aeda..bd07216 100644 --- a/inferedgelab/studio/static/index.html +++ b/inferedgelab/studio/static/index.html @@ -137,8 +137,8 @@ } } - - + +
@@ -311,9 +311,20 @@
           <h2>Compare View</h2>
           <div id="compare-panel"></div>
         </section>
 
-        <section class="panel">
+        <section class="panel" id="guard-evidence-section">
           <p class="caption">05</p>
+          <h2>AIGuard Evidence</h2>
+          <p class="body-text">
+            Optional deterministic diagnosis evidence explains why a result should pass, require review, or be blocked.
+          </p>
+          <div id="guard-evidence-panel" class="guard-panel idle"></div>
+        </section>
+
+        <section class="panel">
+          <p class="caption">06</p>
           <h2>Deployment Decision</h2>
           <p class="body-text">Lab's local gate for deploy, review, or block. AIGuard evidence is optional and not required for this Studio flow.</p>
 
@@ -333,7 +344,7 @@
 
           <h2>Future Work</h2>
 
-    <script src="/studio/static/app.js?v=17"></script>
-    <script src="app.js?v=17"></script>
+    <script src="/studio/static/app.js?v=18"></script>
+    <script src="app.js?v=18"></script>
   </body>
 </html>
diff --git a/inferedgelab/studio/static/style.css b/inferedgelab/studio/static/style.css
index e30150a..4c1c431 100644
--- a/inferedgelab/studio/static/style.css
+++ b/inferedgelab/studio/static/style.css
@@ -675,6 +675,98 @@ body.file-mode .file-protocol-warning {
   min-height: 156px;
 }
 
+.guard-panel {
+  border: 1px solid var(--line);
+  border-radius: 12px;
+  background: var(--panel);
+  box-shadow: var(--shadow);
+  display: grid;
+  gap: 14px;
+  padding: 16px;
+}
+
+.guard-panel h3 {
+  margin: 0;
+  font-size: clamp(1.8rem, 3vw, 2.6rem);
+}
+
+.guard-panel.deployable {
+  border-color: rgba(34, 197, 94, 0.35);
+  background: linear-gradient(180deg, rgba(34, 197, 94, 0.08), var(--panel));
+}
+
+.guard-panel.review {
+  border-color: rgba(234, 179, 8, 0.35);
+  background: linear-gradient(180deg, rgba(234, 179, 8, 0.08), var(--panel));
+}
+
+.guard-panel.blocked {
+  border-color: rgba(239, 68, 68, 0.35);
+  background: linear-gradient(180deg, rgba(239, 68, 68, 0.08), var(--panel));
+}
+
+.guard-summary,
+.guard-source {
+  display: grid;
+  grid-template-columns: repeat(4, minmax(0, 1fr));
+  gap: 8px;
+}
+
+.guard-source strong,
+.guard-list strong {
+  grid-column: 1 / -1;
+}
+
+.guard-evidence-table {
+  display: grid;
+  gap: 6px;
+}
+
+.guard-row {
+  display: grid;
+  grid-template-columns: 1.1fr 1.2fr 0.8fr 0.8fr 0.8fr;
+  gap: 8px;
+  border: 1px solid var(--line);
+  border-radius: 10px;
+  background: rgba(15, 23, 42, 0.78);
+  padding: 10px;
+}
+
+.guard-row span {
+  min-width: 0;
+  overflow-wrap: anywhere;
+}
+
+.guard-row-heading {
+  color: var(--caption);
+  font-size: 0.76rem;
+  font-weight: 900;
+  text-transform: uppercase;
+}
+
+.guard-explanations {
+  display: grid;
+  gap: 8px;
+}
+
+.guard-list {
+  border: 1px solid var(--line);
+  border-radius: 10px;
+  background: rgba(15, 23, 42, 0.78);
+  padding: 12px;
+}
+
+.guard-list ul {
+  margin: 8px 0 0;
+  padding-left: 18px;
+}
+
+.guard-list li {
+  color: var(--muted);
+  line-height: 1.45;
+  margin-top: 4px;
+}
+
 .decision-card h3 {
   margin-top: 8px;
   font-size: clamp(2rem, 4vw, 3.25rem);
@@ -756,6 +848,12 @@ body.file-mode .file-protocol-warning {
     grid-template-columns: 1fr;
   }
 
+  .guard-summary,
+  .guard-source,
+  .guard-row {
+    grid-template-columns: 1fr;
+  }
+
   .inline-fields,
   .future-heading {
     grid-template-columns: 1fr;
diff --git a/tests/test_studio_routes.py b/tests/test_studio_routes.py
index 6aaeb95..b3dc8ba 100644
--- a/tests/test_studio_routes.py
+++ b/tests/test_studio_routes.py
@@ -60,10 +60,10 @@ def test_studio_route_returns_local_studio_html():
     assert "Import" in html
     assert "Jetson Helper" in html
     assert 'data-critical="studio-dark"' in html
-    assert 'href="/studio/static/style.css?v=17"' in html
-    assert 'href="style.css?v=17"' in html
-    assert 'src="/studio/static/app.js?v=17"' in html
-    assert 'src="app.js?v=17"' in html
+    assert 'href="/studio/static/style.css?v=18"' in html
+    assert 'href="style.css?v=18"' in html
+    assert 'src="/studio/static/app.js?v=18"' in html
+    assert 'src="app.js?v=18"' in html
     assert "file-protocol-warning" in html
     assert 'placeholder="results/latest.json"' in html
     assert 'value="results/latest.json"' not in html
@@ -74,6 +74,8 @@
     assert 'id="import-backend-preset"' in html
     assert "TensorRT / Jetson" in html
     assert "Lab's local gate" in html
+    assert "AIGuard Evidence" in html
+    assert 'id="guard-evidence-panel"' in html
     assert "Load Demo Evidence" in html
     assert 'id="demo-state"' in html
     assert 'id="demo-report-summary"' in html
@@ -130,6 +132,9 @@ def test_studio_static_assets_include_redesigned_ui_contracts():
     assert "loadDemoEvidence" in app_text
     assert "renderDemoEvaluation" in app_text
     assert "renderDemoProblemCases" in app_text
+    assert "renderGuardEvidence" in app_text
+    assert "guardEvidenceItems" in app_text
+    assert "guard_verdict" in app_text
     assert "/studio/api/demo-evidence" in app_text
     assert "jobDisplayName" in app_text
     assert "jobCaption" in app_text
@@ -148,6 +153,9 @@ def test_studio_static_assets_include_redesigned_ui_contracts():
     assert ".demo-card" in style_text
     assert ".demo-report-summary" in style_text
     assert ".problem-case-grid" in style_text
+    assert ".guard-panel" in style_text
+    assert ".guard-evidence-table" in style_text
+    assert ".guard-row" in style_text
     assert ".compare-stat-list" in style_text
     assert ".job-row .state-pill" in style_text
     assert "flex-wrap: wrap" in style_text
@@ -351,7 +359,10 @@ def test_studio_demo_evidence_loads_compare_ready_pair():
     assert response["results"][1]["mean_ms"] == 9.9375
     assert response["compare"]["status"] == "ok"
     assert response["compare"]["judgement"]["overall"] == "improvement"
-    assert response["deployment_decision"]["decision"] == "unknown"
+    assert response["guard_analysis"]["guard_verdict"] == "review_required"
+    assert response["guard_analysis"]["evidence"][0]["metric_name"] == "map50"
+    assert response["deployment_decision"]["decision"] == "review_required"
+    assert response["deployment_decision"]["guard_verdict"] == "review_required"
     assert response["evaluation_report"]["preset"] == "yolov8_coco"
     assert response["evaluation_report"]["accuracy"]["status"] == "evaluated"
     assert response["evaluation_report"]["accuracy"]["metrics"]["map50"] > 0
@@ -372,6 +383,7 @@ def test_studio_demo_evidence_loads_compare_ready_pair():
     assert latency_case["latency_checks"]["p99_latency"]["delta_pct"] > 20
     assert latency_case["deployment_signal"]["reason"] == "p99 latency regression detected"
     assert compare["status"] == "ok"
+    assert compare["guard_analysis"]["guard_verdict"] == "review_required"
     assert compare["base"]["backend_key"] == "onnxruntime__cpu"
     assert compare["new"]["backend_key"] == "tensorrt__jetson"
 
@@ -393,6 +405,7 @@ def test_studio_demo_evidence_is_listed_and_selectable_as_job():
     assert detail["job_id"] == "demo_yolov8n_trt_vs_onnx"
     assert detail["status"] == "completed"
     assert detail["result"]["runtime_result"]["backend_key"] == "tensorrt__jetson"
+    assert detail["result"]["guard_analysis"]["guard_verdict"] == "review_required"
     assert detail["result"]["comparison"]["base"]["backend_key"] == "onnxruntime__cpu"
     assert detail["result"]["comparison"]["new"]["backend_key"] == "tensorrt__jetson"
     assert detail["result"]["evaluation_report"]["accuracy"]["metrics"]["precision"] > 0
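For reviewers who want to exercise the new plumbing end to end, the sketch below (not part of the diff) walks the cached guard analysis from `studio_demo_evidence` into `studio_compare_latest`. It is a minimal sketch with assumptions: both routes are treated as plain GET endpoints (the router decorators fall outside this diff's context lines), and the demo fixture files under `examples/studio_demo/` are assumed to be present.

```python
# Hedged sketch of the guard_analysis flow added in this diff; HTTP methods assumed.
from fastapi import FastAPI
from fastapi.testclient import TestClient

from inferedgelab.studio.routes import register_studio

app = FastAPI()
register_studio(app)  # now also seeds app.state.studio_guard_analysis = None

client = TestClient(app)

# Loading demo evidence builds the demo guard analysis, caches it on
# app.state, and threads it into the comparison's deployment decision.
demo = client.get("/studio/api/demo-evidence").json()  # method assumed
assert demo["guard_analysis"]["schema_version"] == "inferedge-aiguard-diagnosis-v1"
assert demo["deployment_decision"]["decision"] == "review_required"

# compare/latest reuses the cached analysis while the demo pair is still
# the newest imported result pair.
latest = client.get("/studio/api/compare/latest").json()  # method assumed
assert latest["guard_analysis"]["guard_verdict"] == "review_required"
```

The reset in `studio_import` (`request.app.state.studio_guard_analysis = None`) is the load-bearing detail here: without it, the demo's `review_required` analysis would keep feeding `build_deployment_decision` after a user imports fresh, unrelated results.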