diff --git a/examples/studio_demo/aiguard_portfolio_cases.json b/examples/studio_demo/aiguard_portfolio_cases.json new file mode 100644 index 0000000..cc6fcc4 --- /dev/null +++ b/examples/studio_demo/aiguard_portfolio_cases.json @@ -0,0 +1,985 @@ +{ + "schema_version": "inferedge-aiguard-portfolio-demo-v1", + "source": "InferEdgeAIGuard Phase 6 portfolio demo cases", + "scope": "local-first evidence replay", + "case_count": 4, + "cases": [ + { + "case_id": "normal_pass", + "title": "Normal output quality", + "category": "normal", + "summary": "BBox and score evidence stay within configured thresholds.", + "guard_analysis": { + "schema_version": "inferedge-aiguard-diagnosis-v1", + "source": { + "runtime_result_path": "examples/single/fp32_normal.json" + }, + "guard_verdict": "pass", + "severity": "low", + "confidence": 0.9, + "primary_reason": "BBox and score structural evidence is within configured thresholds.", + "evidence": [ + { + "type": "bbox_validity", + "metric_name": "invalid_bbox_rate", + "observed_value": 0.0, + "baseline_value": null, + "threshold": 0.05, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "invalid_bbox_rate observed value is 0. Threshold is 0.05. 
This bbox_validity evidence should be reviewed before deployment.", + "why_it_matters": "Invalid boxes indicate that detection geometry may be unusable even when latency looks acceptable.", + "suspected_causes": [], + "recommendation": "BBox validity is within the configured threshold.", + "raw_context": { + "total_predictions": 3, + "invalid_bbox_count": 0, + "invalid_bbox_rate": 0.0, + "zero_area_count": 0, + "out_of_bounds_count": 0, + "nan_or_inf_count": 0, + "bbox_collapse_count": 0, + "bbox_collapse_ratio": 0.0, + "boundary_check_enabled": false + } + }, + { + "type": "bbox_collapse", + "metric_name": "bbox_collapse_ratio", + "observed_value": 0.0, + "baseline_value": null, + "threshold": 0.05, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "bbox_collapse_ratio observed value is 0. Threshold is 0.05. This bbox_collapse evidence should be reviewed before deployment.", + "why_it_matters": "Near-zero area boxes can indicate decoder mismatch or quantization artifacts that make detections unusable.", + "suspected_causes": [], + "recommendation": "BBox collapse ratio is within the configured threshold.", + "raw_context": { + "total_predictions": 3, + "invalid_bbox_count": 0, + "invalid_bbox_rate": 0.0, + "zero_area_count": 0, + "out_of_bounds_count": 0, + "nan_or_inf_count": 0, + "bbox_collapse_count": 0, + "bbox_collapse_ratio": 0.0, + "boundary_check_enabled": false + } + }, + { + "type": "score_range_violation", + "metric_name": "score_range_violation_count", + "observed_value": 0, + "baseline_value": null, + "threshold": 0, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "score_range_violation_count observed value is 0. Threshold is 0. 
This score_range_violation evidence should be reviewed before deployment.", + "why_it_matters": "Scores outside 0..1 or non-finite scores make ranking and thresholding unreliable.", + "suspected_causes": [], + "recommendation": "Score range is valid.", + "raw_context": { + "total_predictions": 3, + "min_score": 0.63, + "max_score": 0.91, + "mean_score": 0.7600000000000001, + "std_score": 0.11518101695447332, + "low_confidence_ratio": 0.0, + "high_confidence_ratio": 0.0, + "saturation_ratio": 0.0, + "score_range_violation_count": 0, + "score_nan_or_inf_count": 0, + "low_threshold": 0.01, + "high_threshold": 0.99 + } + }, + { + "type": "confidence_saturation", + "metric_name": "saturation_ratio", + "observed_value": 0.0, + "baseline_value": null, + "threshold": 0.7, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "saturation_ratio observed value is 0. Threshold is 0.7. This confidence_saturation evidence should be reviewed before deployment.", + "why_it_matters": "Confidence saturation can hide ranking quality problems and may indicate quantization or postprocess mistakes.", + "suspected_causes": [], + "recommendation": "Confidence distribution is within the configured threshold.", + "raw_context": { + "total_predictions": 3, + "min_score": 0.63, + "max_score": 0.91, + "mean_score": 0.7600000000000001, + "std_score": 0.11518101695447332, + "low_confidence_ratio": 0.0, + "high_confidence_ratio": 0.0, + "saturation_ratio": 0.0, + "score_range_violation_count": 0, + "score_nan_or_inf_count": 0, + "low_threshold": 0.01, + "high_threshold": 0.99 + } + } + ], + "suspected_causes": [], + "recommendations": [ + "BBox validity is within the configured threshold.", + "BBox collapse ratio is within the configured threshold.", + "Score range is valid.", + "Confidence distribution is within the configured threshold." 
+ ], + "thresholds": { + "invalid_bbox_rate_review": 0.05, + "invalid_bbox_rate_blocked": 0.2, + "bbox_collapse_ratio_review": 0.05, + "bbox_collapse_ratio_high": 0.1, + "score_saturation_low_threshold": 0.01, + "score_saturation_high_threshold": 0.99, + "saturation_ratio_review": 0.7, + "saturation_ratio_high": 0.85 + }, + "baseline_summary": {}, + "candidate_summary": { + "model": "yolov8n", + "precision": "fp32", + "image_id": "sample_001", + "bbox": { + "total_predictions": 3, + "invalid_bbox_count": 0, + "invalid_bbox_rate": 0.0, + "zero_area_count": 0, + "out_of_bounds_count": 0, + "nan_or_inf_count": 0, + "bbox_collapse_count": 0, + "bbox_collapse_ratio": 0.0, + "boundary_check_enabled": false + }, + "score": { + "total_predictions": 3, + "min_score": 0.63, + "max_score": 0.91, + "mean_score": 0.7600000000000001, + "std_score": 0.11518101695447332, + "low_confidence_ratio": 0.0, + "high_confidence_ratio": 0.0, + "saturation_ratio": 0.0, + "score_range_violation_count": 0, + "score_nan_or_inf_count": 0, + "low_threshold": 0.01, + "high_threshold": 0.99 + } + }, + "created_at": "2026-05-02T12:54:53Z" + } + }, + { + "case_id": "bbox_collapse_blocked", + "title": "Latency improvement with bbox collapse", + "category": "bbox_quality", + "summary": "Candidate latency improves, but bbox collapse increases enough to block deployment review.", + "guard_analysis": { + "schema_version": "inferedge-aiguard-diagnosis-v1", + "source": { + "baseline_result_path": "examples/single/fp32_normal.json", + "candidate_result_path": "examples/single/int8_bbox_collapse.json" + }, + "guard_verdict": "blocked", + "severity": "high", + "confidence": 0.92, + "primary_reason": "Candidate output quality deviates from the baseline diagnosis profile.", + "evidence": [ + { + "type": "baseline_deviation", + "metric_name": "invalid_bbox_rate_factor", + "observed_value": 500000000.0, + "baseline_value": 0.0, + "threshold": 5.0, + "delta": 0.5, + "delta_pct": null, + "increase_factor": 
500000000.0, + "severity": "high", + "status": "failed", + "explanation": "invalid_bbox_rate changed from 0 in the baseline to 0.5 in the candidate (500000000x). Review threshold is 5x.", + "why_it_matters": "A candidate with a much higher invalid bbox rate than the baseline may have unusable detection geometry even if it runs faster.", + "suspected_causes": [ + "Incorrect bbox decoder", + "Output tensor layout mismatch", + "Preprocessing/postprocess mismatch" + ], + "recommendation": "Review decoder, output layout, and postprocess settings.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 2, + "detection_count_delta": -1, + "detection_count_delta_pct": -0.3333333333333333, + "detection_count_drop_pct": 0.3333333333333333, + "invalid_bbox_rate_factor": 500000000.0, + "bbox_collapse_ratio_factor": 500000000.0, + "score_saturation_factor": 1.0, + "latency_delta_pct": -0.735857362976007, + "accuracy_delta_pp": null + } + }, + { + "type": "baseline_deviation", + "metric_name": "bbox_collapse_ratio_factor", + "observed_value": 500000000.0, + "baseline_value": 0.0, + "threshold": 5.0, + "delta": 0.5, + "delta_pct": null, + "increase_factor": 500000000.0, + "severity": "high", + "status": "failed", + "explanation": "bbox_collapse_ratio changed from 0 in the baseline to 0.5 in the candidate (500000000x). 
Review threshold is 5x.", + "why_it_matters": "Near-zero boxes increasing over baseline can indicate decoder or quantization problems that make detections unreliable.", + "suspected_causes": [ + "INT8 quantization artifact", + "Incorrect bbox decoder", + "Preprocessing/postprocess mismatch" + ], + "recommendation": "Do not deploy until bbox collapse drift is reviewed.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 2, + "detection_count_delta": -1, + "detection_count_delta_pct": -0.3333333333333333, + "detection_count_drop_pct": 0.3333333333333333, + "invalid_bbox_rate_factor": 500000000.0, + "bbox_collapse_ratio_factor": 500000000.0, + "score_saturation_factor": 1.0, + "latency_delta_pct": -0.735857362976007, + "accuracy_delta_pp": null + } + }, + { + "type": "baseline_deviation", + "metric_name": "score_saturation_factor", + "observed_value": 1.0, + "baseline_value": 0.0, + "threshold": 5.0, + "delta": 0.0, + "delta_pct": null, + "increase_factor": 1.0, + "severity": "low", + "status": "passed", + "explanation": "saturation_ratio changed from 0 in the baseline to 0 in the candidate (1x). 
Review threshold is 5x.", + "why_it_matters": "Score saturation drift can hide ranking failures and may indicate quantization or score decoder mistakes.", + "suspected_causes": [], + "recommendation": "Baseline deviation is within threshold.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 2, + "detection_count_delta": -1, + "detection_count_delta_pct": -0.3333333333333333, + "detection_count_drop_pct": 0.3333333333333333, + "invalid_bbox_rate_factor": 500000000.0, + "bbox_collapse_ratio_factor": 500000000.0, + "score_saturation_factor": 1.0, + "latency_delta_pct": -0.735857362976007, + "accuracy_delta_pp": null + } + }, + { + "type": "detection_count_drift", + "metric_name": "detection_count_drop_pct", + "observed_value": 0.3333333333333333, + "baseline_value": 3, + "threshold": 0.5, + "delta": -1, + "delta_pct": -0.3333333333333333, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "Candidate detection count changed from 3 to 2 (-0.333 signed delta).", + "why_it_matters": "A large detection count drop can mean the candidate became faster by missing objects rather than by improving execution efficiency.", + "suspected_causes": [], + "recommendation": "Detection count drift is within threshold.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 2, + "detection_count_delta": -1, + "detection_count_delta_pct": -0.3333333333333333, + "detection_count_drop_pct": 0.3333333333333333, + "invalid_bbox_rate_factor": 500000000.0, + "bbox_collapse_ratio_factor": 500000000.0, + "score_saturation_factor": 1.0, + "latency_delta_pct": -0.735857362976007, + "accuracy_delta_pp": null + } + }, + { + "type": "latency_quality_tradeoff", + "metric_name": "latency_delta_pct", + "observed_value": -0.735857362976007, + "baseline_value": 45.43, + "threshold": 0, + "delta": -33.43, + "delta_pct": -0.735857362976007, + "increase_factor": null, + "severity": "high", + "status": "failed", 
+ "explanation": "Candidate latency improved by 0.736, but baseline comparison evidence indicates output quality risk.", + "why_it_matters": "Latency improvement alone is not deployment-ready evidence when bbox, score, or detection count quality regresses.", + "suspected_causes": [ + "Quantization artifact", + "Decoder mismatch", + "Run configuration mismatch" + ], + "recommendation": "Treat speedup as review evidence until output quality drift is resolved.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 2, + "detection_count_delta": -1, + "detection_count_delta_pct": -0.3333333333333333, + "detection_count_drop_pct": 0.3333333333333333, + "invalid_bbox_rate_factor": 500000000.0, + "bbox_collapse_ratio_factor": 500000000.0, + "score_saturation_factor": 1.0, + "latency_delta_pct": -0.735857362976007, + "accuracy_delta_pp": null + } + } + ], + "suspected_causes": [ + "Incorrect bbox decoder", + "Output tensor layout mismatch", + "Preprocessing/postprocess mismatch", + "INT8 quantization artifact", + "Quantization artifact", + "Decoder mismatch", + "Run configuration mismatch" + ], + "recommendations": [ + "Review decoder, output layout, and postprocess settings.", + "Do not deploy until bbox collapse drift is reviewed.", + "Baseline deviation is within threshold.", + "Detection count drift is within threshold.", + "Treat speedup as review evidence until output quality drift is resolved." 
+ ], + "thresholds": { + "invalid_bbox_rate_review": 0.05, + "invalid_bbox_rate_blocked": 0.2, + "bbox_collapse_ratio_review": 0.05, + "bbox_collapse_ratio_high": 0.1, + "score_saturation_low_threshold": 0.01, + "score_saturation_high_threshold": 0.99, + "saturation_ratio_review": 0.7, + "saturation_ratio_high": 0.85, + "invalid_bbox_rate_factor_review": 5.0, + "invalid_bbox_rate_factor_blocked": 10.0, + "bbox_collapse_ratio_factor_review": 5.0, + "bbox_collapse_ratio_factor_blocked": 10.0, + "score_saturation_factor_review": 5.0, + "score_saturation_factor_blocked": 10.0, + "detection_count_drop_pct_review": 0.5, + "detection_count_drop_pct_blocked": 0.8 + }, + "baseline_summary": { + "label": "fp32_baseline", + "model": "yolov8n", + "precision": "fp32", + "bbox": { + "total_predictions": 3, + "invalid_bbox_count": 0, + "invalid_bbox_rate": 0.0, + "zero_area_count": 0, + "out_of_bounds_count": 0, + "nan_or_inf_count": 0, + "bbox_collapse_count": 0, + "bbox_collapse_ratio": 0.0, + "boundary_check_enabled": false + }, + "score": { + "total_predictions": 3, + "min_score": 0.63, + "max_score": 0.91, + "mean_score": 0.7600000000000001, + "std_score": 0.11518101695447332, + "low_confidence_ratio": 0.0, + "high_confidence_ratio": 0.0, + "saturation_ratio": 0.0, + "score_range_violation_count": 0, + "score_nan_or_inf_count": 0, + "low_threshold": 0.01, + "high_threshold": 0.99 + }, + "latency_ms": 45.43, + "accuracy": null + }, + "candidate_summary": { + "label": "int8_bbox_collapse", + "model": "yolov8n", + "precision": "int8", + "bbox": { + "total_predictions": 2, + "invalid_bbox_count": 1, + "invalid_bbox_rate": 0.5, + "zero_area_count": 1, + "out_of_bounds_count": 0, + "nan_or_inf_count": 0, + "bbox_collapse_count": 1, + "bbox_collapse_ratio": 0.5, + "boundary_check_enabled": false + }, + "score": { + "total_predictions": 2, + "min_score": 0.72, + "max_score": 0.89, + "mean_score": 0.8049999999999999, + "std_score": 0.08500000000000002, + "low_confidence_ratio": 0.0, 
+ "high_confidence_ratio": 0.0, + "saturation_ratio": 0.0, + "score_range_violation_count": 0, + "score_nan_or_inf_count": 0, + "low_threshold": 0.01, + "high_threshold": 0.99 + }, + "latency_ms": 12.0, + "accuracy": null, + "comparison": { + "baseline_detection_count": 3, + "candidate_detection_count": 2, + "detection_count_delta": -1, + "detection_count_delta_pct": -0.3333333333333333, + "detection_count_drop_pct": 0.3333333333333333, + "invalid_bbox_rate_factor": 500000000.0, + "bbox_collapse_ratio_factor": 500000000.0, + "score_saturation_factor": 1.0, + "latency_delta_pct": -0.735857362976007, + "accuracy_delta_pp": null + } + }, + "created_at": "2026-05-02T12:54:53Z" + } + }, + { + "case_id": "score_saturation_blocked", + "title": "Confidence score saturation", + "category": "confidence_distribution", + "summary": "Candidate scores concentrate near 0 or 1, indicating possible quantization or postprocess risk.", + "guard_analysis": { + "schema_version": "inferedge-aiguard-diagnosis-v1", + "source": { + "baseline_result_path": "examples/single/fp32_normal.json", + "candidate_result_path": "examples/single/int8_conf_saturation.json" + }, + "guard_verdict": "blocked", + "severity": "high", + "confidence": 0.92, + "primary_reason": "Candidate output quality deviates from the baseline diagnosis profile.", + "evidence": [ + { + "type": "baseline_deviation", + "metric_name": "invalid_bbox_rate_factor", + "observed_value": 1.0, + "baseline_value": 0.0, + "threshold": 5.0, + "delta": 0.0, + "delta_pct": null, + "increase_factor": 1.0, + "severity": "low", + "status": "passed", + "explanation": "invalid_bbox_rate changed from 0 in the baseline to 0 in the candidate (1x). 
Review threshold is 5x.", + "why_it_matters": "A candidate with a much higher invalid bbox rate than the baseline may have unusable detection geometry even if it runs faster.", + "suspected_causes": [], + "recommendation": "Baseline deviation is within threshold.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 5, + "detection_count_delta": 2, + "detection_count_delta_pct": 0.6666666666666666, + "detection_count_drop_pct": 0.0, + "invalid_bbox_rate_factor": 1.0, + "bbox_collapse_ratio_factor": 1.0, + "score_saturation_factor": 1000000000.0, + "latency_delta_pct": -0.6808276469293418, + "accuracy_delta_pp": null + } + }, + { + "type": "baseline_deviation", + "metric_name": "bbox_collapse_ratio_factor", + "observed_value": 1.0, + "baseline_value": 0.0, + "threshold": 5.0, + "delta": 0.0, + "delta_pct": null, + "increase_factor": 1.0, + "severity": "low", + "status": "passed", + "explanation": "bbox_collapse_ratio changed from 0 in the baseline to 0 in the candidate (1x). 
Review threshold is 5x.", + "why_it_matters": "Near-zero boxes increasing over baseline can indicate decoder or quantization problems that make detections unreliable.", + "suspected_causes": [], + "recommendation": "Baseline deviation is within threshold.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 5, + "detection_count_delta": 2, + "detection_count_delta_pct": 0.6666666666666666, + "detection_count_drop_pct": 0.0, + "invalid_bbox_rate_factor": 1.0, + "bbox_collapse_ratio_factor": 1.0, + "score_saturation_factor": 1000000000.0, + "latency_delta_pct": -0.6808276469293418, + "accuracy_delta_pp": null + } + }, + { + "type": "baseline_deviation", + "metric_name": "score_saturation_factor", + "observed_value": 1000000000.0, + "baseline_value": 0.0, + "threshold": 5.0, + "delta": 1.0, + "delta_pct": null, + "increase_factor": 1000000000.0, + "severity": "high", + "status": "failed", + "explanation": "saturation_ratio changed from 0 in the baseline to 1 in the candidate (1000000000x). 
Review threshold is 5x.", + "why_it_matters": "Score saturation drift can hide ranking failures and may indicate quantization or score decoder mistakes.", + "suspected_causes": [ + "Quantization artifact", + "Duplicated sigmoid/postprocess", + "Incorrect score decoder" + ], + "recommendation": "Review score decoder and quantization calibration.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 5, + "detection_count_delta": 2, + "detection_count_delta_pct": 0.6666666666666666, + "detection_count_drop_pct": 0.0, + "invalid_bbox_rate_factor": 1.0, + "bbox_collapse_ratio_factor": 1.0, + "score_saturation_factor": 1000000000.0, + "latency_delta_pct": -0.6808276469293418, + "accuracy_delta_pp": null + } + }, + { + "type": "detection_count_drift", + "metric_name": "detection_count_drop_pct", + "observed_value": 0.0, + "baseline_value": 3, + "threshold": 0.5, + "delta": 2, + "delta_pct": 0.6666666666666666, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "Candidate detection count changed from 3 to 5 (0.667 signed delta).", + "why_it_matters": "A large detection count drop can mean the candidate became faster by missing objects rather than by improving execution efficiency.", + "suspected_causes": [], + "recommendation": "Detection count drift is within threshold.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 5, + "detection_count_delta": 2, + "detection_count_delta_pct": 0.6666666666666666, + "detection_count_drop_pct": 0.0, + "invalid_bbox_rate_factor": 1.0, + "bbox_collapse_ratio_factor": 1.0, + "score_saturation_factor": 1000000000.0, + "latency_delta_pct": -0.6808276469293418, + "accuracy_delta_pp": null + } + }, + { + "type": "latency_quality_tradeoff", + "metric_name": "latency_delta_pct", + "observed_value": -0.6808276469293418, + "baseline_value": 45.43, + "threshold": 0, + "delta": -30.93, + "delta_pct": -0.6808276469293418, + "increase_factor": null, 
+ "severity": "high", + "status": "failed", + "explanation": "Candidate latency improved by 0.681, but baseline comparison evidence indicates output quality risk.", + "why_it_matters": "Latency improvement alone is not deployment-ready evidence when bbox, score, or detection count quality regresses.", + "suspected_causes": [ + "Quantization artifact", + "Decoder mismatch", + "Run configuration mismatch" + ], + "recommendation": "Treat speedup as review evidence until output quality drift is resolved.", + "raw_context": { + "baseline_detection_count": 3, + "candidate_detection_count": 5, + "detection_count_delta": 2, + "detection_count_delta_pct": 0.6666666666666666, + "detection_count_drop_pct": 0.0, + "invalid_bbox_rate_factor": 1.0, + "bbox_collapse_ratio_factor": 1.0, + "score_saturation_factor": 1000000000.0, + "latency_delta_pct": -0.6808276469293418, + "accuracy_delta_pp": null + } + } + ], + "suspected_causes": [ + "Quantization artifact", + "Duplicated sigmoid/postprocess", + "Incorrect score decoder", + "Decoder mismatch", + "Run configuration mismatch" + ], + "recommendations": [ + "Baseline deviation is within threshold.", + "Review score decoder and quantization calibration.", + "Detection count drift is within threshold.", + "Treat speedup as review evidence until output quality drift is resolved." 
+ ], + "thresholds": { + "invalid_bbox_rate_review": 0.05, + "invalid_bbox_rate_blocked": 0.2, + "bbox_collapse_ratio_review": 0.05, + "bbox_collapse_ratio_high": 0.1, + "score_saturation_low_threshold": 0.01, + "score_saturation_high_threshold": 0.99, + "saturation_ratio_review": 0.7, + "saturation_ratio_high": 0.85, + "invalid_bbox_rate_factor_review": 5.0, + "invalid_bbox_rate_factor_blocked": 10.0, + "bbox_collapse_ratio_factor_review": 5.0, + "bbox_collapse_ratio_factor_blocked": 10.0, + "score_saturation_factor_review": 5.0, + "score_saturation_factor_blocked": 10.0, + "detection_count_drop_pct_review": 0.5, + "detection_count_drop_pct_blocked": 0.8 + }, + "baseline_summary": { + "label": "fp32_baseline", + "model": "yolov8n", + "precision": "fp32", + "bbox": { + "total_predictions": 3, + "invalid_bbox_count": 0, + "invalid_bbox_rate": 0.0, + "zero_area_count": 0, + "out_of_bounds_count": 0, + "nan_or_inf_count": 0, + "bbox_collapse_count": 0, + "bbox_collapse_ratio": 0.0, + "boundary_check_enabled": false + }, + "score": { + "total_predictions": 3, + "min_score": 0.63, + "max_score": 0.91, + "mean_score": 0.7600000000000001, + "std_score": 0.11518101695447332, + "low_confidence_ratio": 0.0, + "high_confidence_ratio": 0.0, + "saturation_ratio": 0.0, + "score_range_violation_count": 0, + "score_nan_or_inf_count": 0, + "low_threshold": 0.01, + "high_threshold": 0.99 + }, + "latency_ms": 45.43, + "accuracy": null + }, + "candidate_summary": { + "label": "int8_score_saturation", + "model": "yolov8n", + "precision": "int8", + "bbox": { + "total_predictions": 5, + "invalid_bbox_count": 0, + "invalid_bbox_rate": 0.0, + "zero_area_count": 0, + "out_of_bounds_count": 0, + "nan_or_inf_count": 0, + "bbox_collapse_count": 0, + "bbox_collapse_ratio": 0.0, + "boundary_check_enabled": false + }, + "score": { + "total_predictions": 5, + "min_score": 0.002, + "max_score": 0.999, + "mean_score": 0.5993999999999999, + "std_score": 0.48696061442379507, + "low_confidence_ratio": 
0.4, + "high_confidence_ratio": 0.6, + "saturation_ratio": 1.0, + "score_range_violation_count": 0, + "score_nan_or_inf_count": 0, + "low_threshold": 0.01, + "high_threshold": 0.99 + }, + "latency_ms": 14.5, + "accuracy": null, + "comparison": { + "baseline_detection_count": 3, + "candidate_detection_count": 5, + "detection_count_delta": 2, + "detection_count_delta_pct": 0.6666666666666666, + "detection_count_drop_pct": 0.0, + "invalid_bbox_rate_factor": 1.0, + "bbox_collapse_ratio_factor": 1.0, + "score_saturation_factor": 1000000000.0, + "latency_delta_pct": -0.6808276469293418, + "accuracy_delta_pp": null + } + }, + "created_at": "2026-05-02T12:54:53Z" + } + }, + { + "case_id": "temporal_instability_review", + "title": "Temporal instability", + "category": "temporal_consistency", + "summary": "Frame-level detection count variance is high enough to require review before deployment.", + "guard_analysis": { + "schema_version": "inferedge-aiguard-diagnosis-v1", + "source": { + "sequence_path": "examples/portfolio_demo/temporal_instability" + }, + "guard_verdict": "review_required", + "severity": "medium", + "confidence": 0.88, + "primary_reason": "Temporal consistency should be reviewed before deployment.", + "evidence": [ + { + "type": "temporal_consistency", + "metric_name": "frame_to_frame_detection_count_cv", + "observed_value": 1.224744871391589, + "baseline_value": null, + "threshold": 1.0, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "medium", + "status": "warning", + "explanation": "Frame-to-frame detection count CV is 1.225. 
Review threshold is 1.", + "why_it_matters": "Large detection count variance can indicate unstable output across adjacent frames in the same scene.", + "suspected_causes": [ + "Temporal instability", + "Preprocessing/postprocess mismatch", + "Confidence threshold instability" + ], + "recommendation": "Review frame sequence output before deployment.", + "raw_context": { + "frame_count": 5, + "detection_counts": [ + 0, + 10, + 0, + 10, + 0 + ], + "mean_detection_count": 4.0, + "std_detection_count": 4.898979485566356, + "frame_to_frame_detection_count_cv": 1.224744871391589, + "zero_detection_frame_count": 3, + "zero_detection_frame_ratio": 0.6, + "bbox_center_jump_mean": 0.0, + "bbox_center_jump_p95": 0.0, + "bbox_center_jump_unit": "pixels", + "class_flip_rate": 0.0, + "dominant_classes": [ + null, + 0, + null, + 0, + null + ] + } + }, + { + "type": "temporal_consistency", + "metric_name": "zero_detection_frame_ratio", + "observed_value": 0.6, + "baseline_value": null, + "threshold": 1.0, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "Zero-detection frame ratio is 0.6. 
Blocked threshold is 1.", + "why_it_matters": "Frequent zero-detection frames can mean objects disappear across a sequence even when individual frames appear valid.", + "suspected_causes": [], + "recommendation": "Zero-detection frame ratio is within threshold.", + "raw_context": { + "frame_count": 5, + "detection_counts": [ + 0, + 10, + 0, + 10, + 0 + ], + "mean_detection_count": 4.0, + "std_detection_count": 4.898979485566356, + "frame_to_frame_detection_count_cv": 1.224744871391589, + "zero_detection_frame_count": 3, + "zero_detection_frame_ratio": 0.6, + "bbox_center_jump_mean": 0.0, + "bbox_center_jump_p95": 0.0, + "bbox_center_jump_unit": "pixels", + "class_flip_rate": 0.0, + "dominant_classes": [ + null, + 0, + null, + 0, + null + ] + } + }, + { + "type": "temporal_consistency", + "metric_name": "bbox_center_jump_p95", + "observed_value": 0.0, + "baseline_value": null, + "threshold": 0.5, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "BBox center jump p95 is 0 (pixels). 
Review threshold is 0.5.", + "why_it_matters": "Large bbox center jumps across adjacent frames can indicate unstable geometry without requiring a full tracker.", + "suspected_causes": [], + "recommendation": "BBox center jumps are within threshold.", + "raw_context": { + "frame_count": 5, + "detection_counts": [ + 0, + 10, + 0, + 10, + 0 + ], + "mean_detection_count": 4.0, + "std_detection_count": 4.898979485566356, + "frame_to_frame_detection_count_cv": 1.224744871391589, + "zero_detection_frame_count": 3, + "zero_detection_frame_ratio": 0.6, + "bbox_center_jump_mean": 0.0, + "bbox_center_jump_p95": 0.0, + "bbox_center_jump_unit": "pixels", + "class_flip_rate": 0.0, + "dominant_classes": [ + null, + 0, + null, + 0, + null + ] + } + }, + { + "type": "temporal_consistency", + "metric_name": "class_flip_rate", + "observed_value": 0.0, + "baseline_value": null, + "threshold": 0.3, + "delta": null, + "delta_pct": null, + "increase_factor": null, + "severity": "low", + "status": "passed", + "explanation": "Dominant class flip rate is 0. 
Review threshold is 0.3.", + "why_it_matters": "Frequent dominant class changes in adjacent non-empty frames can indicate unstable classification in a detection sequence.", + "suspected_causes": [], + "recommendation": "Dominant class stability is within threshold.", + "raw_context": { + "frame_count": 5, + "detection_counts": [ + 0, + 10, + 0, + 10, + 0 + ], + "mean_detection_count": 4.0, + "std_detection_count": 4.898979485566356, + "frame_to_frame_detection_count_cv": 1.224744871391589, + "zero_detection_frame_count": 3, + "zero_detection_frame_ratio": 0.6, + "bbox_center_jump_mean": 0.0, + "bbox_center_jump_p95": 0.0, + "bbox_center_jump_unit": "pixels", + "class_flip_rate": 0.0, + "dominant_classes": [ + null, + 0, + null, + 0, + null + ] + } + } + ], + "suspected_causes": [ + "Temporal instability", + "Preprocessing/postprocess mismatch", + "Confidence threshold instability" + ], + "recommendations": [ + "Review frame sequence output before deployment.", + "Zero-detection frame ratio is within threshold.", + "BBox center jumps are within threshold.", + "Dominant class stability is within threshold." 
def _load_aiguard_portfolio_cases() -> dict[str, Any]:
    """Load the bundled AIGuard portfolio demo cases for the Studio demo.

    Reads ``DEMO_EVIDENCE_DIR / AIGUARD_PORTFOLIO_CASES`` as UTF-8 encoded JSON
    and returns a normalized bundle for the demo evidence payload.

    Returns:
        A dict with these keys:
        ``schema_version`` and ``scope`` copied from the file (``None`` when
        absent), ``source`` set to the repository-relative fixture path,
        ``case_count`` equal to the number of loaded cases, and ``cases``
        holding the list of case objects.

    Raises:
        HTTPException: status 500 when the file cannot be read, when its
            content is not valid UTF-8 JSON, or when it does not contain a
            ``cases`` list made of JSON objects.
    """
    path = DEMO_EVIDENCE_DIR / AIGUARD_PORTFOLIO_CASES
    try:
        bundle = json.loads(path.read_text(encoding="utf-8"))
    except OSError as exc:
        raise HTTPException(
            status_code=500,
            detail=f"AIGuard portfolio cases not found: {AIGUARD_PORTFOLIO_CASES}",
        ) from exc
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        # read_text() raises UnicodeDecodeError for bytes that are not UTF-8.
        # It is a ValueError, not an OSError or JSONDecodeError, so it has to
        # be listed explicitly to be reported with the same 500 detail.
        raise HTTPException(
            status_code=500,
            detail=f"AIGuard portfolio cases are invalid JSON: {AIGUARD_PORTFOLIO_CASES}",
        ) from exc

    cases = bundle.get("cases") if isinstance(bundle, dict) else None
    # Every case must be a JSON object; a list of scalars or nulls is treated
    # as a schema error just like a missing or non-list "cases" value.
    if not isinstance(cases, list) or not all(isinstance(case, dict) for case in cases):
        raise HTTPException(
            status_code=500,
            detail=f"AIGuard portfolio cases schema error: {AIGUARD_PORTFOLIO_CASES}",
        )

    return {
        "schema_version": bundle.get("schema_version"),
        # The file's own "source" value is not forwarded; the payload reports
        # the fixture path instead.
        "source": f"examples/studio_demo/{AIGUARD_PORTFOLIO_CASES}",
        "scope": bundle.get("scope"),
        # Derived from the list itself so the reported count cannot drift from
        # (or be nulled by) the "case_count" value declared inside the file.
        "case_count": len(cases),
        "cases": cases,
    }
_build_demo_job(results, compare, evaluation_report, problem_cases, guard_demo_cases) _get_demo_jobs(request)[DEMO_JOB_ID] = demo_job return { "status": "loaded", @@ -194,6 +196,7 @@ def studio_demo_evidence(request: Request) -> dict[str, Any]: "compare": compare, "evaluation_report": evaluation_report, "problem_cases": problem_cases, + "guard_demo_cases": guard_demo_cases, "guard_analysis": guard_analysis, "deployment_decision": compare["deployment_decision"], } @@ -397,6 +400,28 @@ def _load_latency_regression_summary() -> dict[str, Any]: } +def _load_aiguard_portfolio_cases() -> dict[str, Any]: + path = DEMO_EVIDENCE_DIR / AIGUARD_PORTFOLIO_CASES + try: + bundle = json.loads(path.read_text(encoding="utf-8")) + except OSError as exc: + raise HTTPException(status_code=500, detail=f"AIGuard portfolio cases not found: {AIGUARD_PORTFOLIO_CASES}") from exc + except json.JSONDecodeError as exc: + raise HTTPException(status_code=500, detail=f"AIGuard portfolio cases are invalid JSON: {AIGUARD_PORTFOLIO_CASES}") from exc + + cases = bundle.get("cases") if isinstance(bundle, dict) else None + if not isinstance(cases, list): + raise HTTPException(status_code=500, detail=f"AIGuard portfolio cases schema error: {AIGUARD_PORTFOLIO_CASES}") + + return { + "schema_version": bundle.get("schema_version"), + "source": f"examples/studio_demo/{AIGUARD_PORTFOLIO_CASES}", + "scope": bundle.get("scope"), + "case_count": bundle.get("case_count", len(cases)), + "cases": cases, + } + + def _load_problem_report(file_name: str) -> dict[str, Any]: path = VALIDATION_PROBLEM_DIR / file_name try: @@ -429,6 +454,7 @@ def _build_demo_job( compare: dict[str, Any], evaluation_report: dict[str, Any], problem_cases: list[dict[str, Any]], + guard_demo_cases: dict[str, Any], ) -> dict[str, Any]: now = _utc_now_iso() runtime_result = results[-1] if results else {} @@ -450,6 +476,7 @@ def _build_demo_job( "guard_analysis": compare.get("guard_analysis"), "evaluation_report": evaluation_report, 
"problem_cases": problem_cases, + "guard_demo_cases": guard_demo_cases, "summary": compare["judgement"]["summary"], }, "error": None, diff --git a/inferedgelab/studio/static/app.js b/inferedgelab/studio/static/app.js index 0987777..576ead9 100644 --- a/inferedgelab/studio/static/app.js +++ b/inferedgelab/studio/static/app.js @@ -31,6 +31,7 @@ let importedResult = null; let demoEvaluationReport = null; let demoProblemCases = []; let activeGuardAnalysis = null; +let guardDemoCases = null; const importedResultsByJobId = {}; function createElement(tagName, className, textContent) { @@ -373,6 +374,7 @@ async function loadDemoEvidence() { importedResult = results[results.length - 1] || null; demoEvaluationReport = payload.evaluation_report || null; demoProblemCases = Array.isArray(payload.problem_cases) ? payload.problem_cases : []; + guardDemoCases = payload.guard_demo_cases || null; compareData = payload.compare || null; updateGuardEvidence(payload.guard_analysis || payload.compare?.guard_analysis || null); selectedJobId = payload.job_id || payload.job?.job_id || selectedJobId; @@ -384,6 +386,7 @@ async function loadDemoEvidence() { renderImportEvidence({ result: importedResult }); renderDemoEvaluation(demoEvaluationReport); renderDemoProblemCases(demoProblemCases); + renderGuardDemoCases(guardDemoCases); renderImportedResult(); await loadJobs(selectedJobId); await loadCompare(); @@ -420,6 +423,46 @@ function renderDemoProblemCases(problemCases = []) { }); } +function renderGuardDemoCases(bundle) { + const target = document.querySelector("#guard-demo-cases"); + if (!target) { + return; + } + target.replaceChildren(); + + const cases = Array.isArray(bundle?.cases) ? 
bundle.cases : []; + if (!cases.length) { + return; + } + + const heading = createElement("div", "guard-demo-heading"); + heading.append( + createElement("p", "caption", "Portfolio diagnosis cases"), + createElement("h3", "", "AIGuard demo evidence"), + createElement( + "p", + "body-text", + "Normal, bbox collapse, score saturation, and temporal instability evidence are replayed from local fixtures.", + ), + ); + target.append(heading); + + cases.forEach((item) => { + const analysis = item.guard_analysis || {}; + const verdict = guardVerdict(analysis); + const card = createElement("article", `guard-demo-card ${decisionTone(verdict)}`); + card.append( + createElement("p", "caption", item.category || "diagnosis"), + createElement("h4", "", item.title || item.case_id || "AIGuard case"), + createElement("p", "body-text", item.summary || analysis.primary_reason || "-"), + evidenceItem("guard_verdict", verdict), + evidenceItem("severity", analysis.severity || "-"), + evidenceItem("primary_metric", primaryGuardMetric(analysis)), + ); + target.append(card); + }); +} + function problemCaseDetail(problem = {}) { if (problem.problem_case_type === "runtime_latency" || problem.latency_checks) { const checks = problem.latency_checks || {}; @@ -523,6 +566,7 @@ function renderRunPanel() { setState("#demo-state", "idle"); renderDemoEvaluation(null); renderDemoProblemCases([]); + renderGuardDemoCases(null); } function resetTransientInputs() { @@ -1001,6 +1045,12 @@ function guardEvidenceItems(guardAnalysis = {}) { return []; } +function primaryGuardMetric(guardAnalysis = {}) { + const evidence = guardEvidenceItems(guardAnalysis); + const primary = evidence.find((item) => item.status === "failed" || item.status === "warning") || evidence[0] || {}; + return primary.metric_name || primary.type || "-"; +} + function decisionReason(decision) { const decisionName = String(decision?.decision || "unknown").toLowerCase(); if (decisionName === "unknown" && !decision?.guard_status) { diff 
--git a/inferedgelab/studio/static/index.html b/inferedgelab/studio/static/index.html index bd07216..e6aae85 100644 --- a/inferedgelab/studio/static/index.html +++ b/inferedgelab/studio/static/index.html @@ -137,8 +137,8 @@ } } - - + +
@@ -320,6 +320,7 @@

AIGuard Evidence

+
@@ -344,7 +345,7 @@

Future Work

- - + + diff --git a/inferedgelab/studio/static/style.css b/inferedgelab/studio/static/style.css index 4c1c431..0fd29d7 100644 --- a/inferedgelab/studio/static/style.css +++ b/inferedgelab/studio/static/style.css @@ -767,6 +767,49 @@ body.file-mode .file-protocol-warning { margin-top: 4px; } +.guard-demo-grid { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 10px; + margin-top: 12px; +} + +.guard-demo-heading { + grid-column: 1 / -1; +} + +.guard-demo-heading h3 { + margin: 4px 0 6px; +} + +.guard-demo-card { + border: 1px solid var(--line); + border-radius: 10px; + background: rgba(15, 23, 42, 0.78); + padding: 12px; +} + +.guard-demo-card h4 { + margin: 4px 0 8px; + font-size: 0.95rem; +} + +.guard-demo-card .evidence-item { + margin-top: 8px; +} + +.guard-demo-card.deployable { + border-color: rgba(34, 197, 94, 0.32); +} + +.guard-demo-card.review { + border-color: rgba(234, 179, 8, 0.35); +} + +.guard-demo-card.blocked { + border-color: rgba(239, 68, 68, 0.35); +} + .decision-card h3 { margin-top: 8px; font-size: clamp(2rem, 4vw, 3.25rem); @@ -850,7 +893,8 @@ body.file-mode .file-protocol-warning { .guard-summary, .guard-source, - .guard-row { + .guard-row, + .guard-demo-grid { grid-template-columns: 1fr; } diff --git a/tests/test_studio_routes.py b/tests/test_studio_routes.py index b3dc8ba..521d33e 100644 --- a/tests/test_studio_routes.py +++ b/tests/test_studio_routes.py @@ -60,10 +60,10 @@ def test_studio_route_returns_local_studio_html(): assert "Import" in html assert "Jetson Helper" in html assert 'data-critical="studio-dark"' in html - assert 'href="/studio/static/style.css?v=18"' in html - assert 'href="style.css?v=18"' in html - assert 'src="/studio/static/app.js?v=18"' in html - assert 'src="app.js?v=18"' in html + assert 'href="/studio/static/style.css?v=19"' in html + assert 'href="style.css?v=19"' in html + assert 'src="/studio/static/app.js?v=19"' in html + assert 'src="app.js?v=19"' in html assert 
"file-protocol-warning" in html assert 'placeholder="results/latest.json"' in html assert 'value="results/latest.json"' not in html @@ -76,6 +76,7 @@ def test_studio_route_returns_local_studio_html(): assert "Lab's local gate" in html assert "AIGuard Evidence" in html assert 'id="guard-evidence-panel"' in html + assert 'id="guard-demo-cases"' in html assert "Load Demo Evidence" in html assert 'id="demo-state"' in html assert 'id="demo-report-summary"' in html @@ -133,7 +134,9 @@ def test_studio_static_assets_include_redesigned_ui_contracts(): assert "renderDemoEvaluation" in app_text assert "renderDemoProblemCases" in app_text assert "renderGuardEvidence" in app_text + assert "renderGuardDemoCases" in app_text assert "guardEvidenceItems" in app_text + assert "primaryGuardMetric" in app_text assert "guard_verdict" in app_text assert "/studio/api/demo-evidence" in app_text assert "jobDisplayName" in app_text @@ -154,6 +157,8 @@ def test_studio_static_assets_include_redesigned_ui_contracts(): assert ".demo-report-summary" in style_text assert ".problem-case-grid" in style_text assert ".guard-panel" in style_text + assert ".guard-demo-grid" in style_text + assert ".guard-demo-card" in style_text assert ".guard-evidence-table" in style_text assert ".guard-row" in style_text assert ".compare-stat-list" in style_text @@ -361,6 +366,13 @@ def test_studio_demo_evidence_loads_compare_ready_pair(): assert response["compare"]["judgement"]["overall"] == "improvement" assert response["guard_analysis"]["guard_verdict"] == "review_required" assert response["guard_analysis"]["evidence"][0]["metric_name"] == "map50" + assert response["guard_demo_cases"]["schema_version"] == "inferedge-aiguard-portfolio-demo-v1" + assert response["guard_demo_cases"]["case_count"] == 4 + guard_cases = {case["case_id"]: case for case in response["guard_demo_cases"]["cases"]} + assert guard_cases["normal_pass"]["guard_analysis"]["guard_verdict"] == "pass" + assert 
guard_cases["bbox_collapse_blocked"]["guard_analysis"]["guard_verdict"] == "blocked" + assert guard_cases["score_saturation_blocked"]["guard_analysis"]["guard_verdict"] == "blocked" + assert guard_cases["temporal_instability_review"]["guard_analysis"]["guard_verdict"] == "review_required" assert response["deployment_decision"]["decision"] == "review_required" assert response["deployment_decision"]["guard_verdict"] == "review_required" assert response["evaluation_report"]["preset"] == "yolov8_coco" @@ -406,6 +418,7 @@ def test_studio_demo_evidence_is_listed_and_selectable_as_job(): assert detail["status"] == "completed" assert detail["result"]["runtime_result"]["backend_key"] == "tensorrt__jetson" assert detail["result"]["guard_analysis"]["guard_verdict"] == "review_required" + assert detail["result"]["guard_demo_cases"]["case_count"] == 4 assert detail["result"]["comparison"]["base"]["backend_key"] == "onnxruntime__cpu" assert detail["result"]["comparison"]["new"]["backend_key"] == "tensorrt__jetson" assert detail["result"]["evaluation_report"]["accuracy"]["metrics"]["precision"] > 0