Binary file added .DS_Store
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -212,4 +212,9 @@ cactus
server/

# Leaderboard data
docs/
docs/

# Alternate benchmark (local test set)
benchmark_alt.py
benchmark_v3.py
benchmark_v4.py
7 changes: 7 additions & 0 deletions benchmark.py
@@ -406,6 +406,13 @@ def run_benchmark(benchmarks=None):
print(f"[{i}/{total}] Running: {case['name']} ({case['difficulty']})...", end=" ", flush=True)
result = generate_hybrid(case["messages"], case["tools"])
f1 = compute_f1(result["function_calls"], case["expected_calls"])

if f1 == 0.0 and result.get("source") == "on-device":
print(f"\n--- DEBUG {case['name']} ---")
print(f"EXPECTED: {case['expected_calls']}")
print(f"ACTUAL: {result['function_calls']}")
print("------------------------\n")

source = result.get("source", "unknown")
print(f"F1={f1:.2f} | {result['total_time_ms']:.0f}ms | {source}")
results.append({
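The F1 gate added in the diff above relies on compute_f1 over the actual and expected function calls. A minimal sketch of such a metric, assuming calls are compared as (name, arguments) pairs (the real compute_f1 in benchmark.py may differ):

```python
def compute_f1(actual, expected):
    """Set-based F1 over function calls; each call is a (name, sorted-args) key."""
    def key(call):
        # Assumes argument values are hashable (strings, numbers, booleans).
        return (call["name"], tuple(sorted(call.get("arguments", {}).items())))

    actual_set = {key(c) for c in actual}
    expected_set = {key(c) for c in expected}
    if not actual_set and not expected_set:
        return 1.0  # both empty: vacuously correct
    tp = len(actual_set & expected_set)
    if tp == 0:
        return 0.0
    precision = tp / len(actual_set)
    recall = tp / len(expected_set)
    return 2 * precision * recall / (precision + recall)
```

Under this definition, F1 == 0.0 means no local call matched any expected call, which is exactly the condition the new debug block keys on.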
30 changes: 30 additions & 0 deletions info.md
@@ -0,0 +1,30 @@
• Implemented: I changed only the hybrid routing logic in main.py, replacing raw confidence gating with a validation-first strategy.

What changed in generate_hybrid:

- main.py:97 now does:

1. Run on-device first.
2. Infer expected intents from user text (weather, alarm, timer, message, reminder, contacts, play music).
3. Validate local output against tool schema (required args + type checks).
4. Validate semantic consistency against extracted slots (time, minutes, location, recipient/message, reminder title/time, contact query, song).
5. Apply dynamic confidence thresholds by complexity:
- single-intent: 0.80
- two intents: 0.92
- three+ intents: 0.96
6. Fallback to cloud only if local output fails semantic/schema checks or confidence threshold.
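Steps 5 and 6 above can be sketched as a small routing helper (a minimal sketch using the thresholds listed above; function and field names are illustrative, not the actual main.py API):

```python
def choose_threshold(num_intents):
    """Dynamic confidence threshold by task complexity (values from the steps above)."""
    if num_intents <= 1:
        return 0.80
    if num_intents == 2:
        return 0.92
    return 0.96  # three or more intents

def should_fallback(local_confidence, num_intents, schema_ok, semantics_ok):
    """Route to cloud only when local output fails validation or the threshold."""
    if not schema_ok or not semantics_ok:
        return True  # invalid local output always falls back, regardless of confidence
    return local_confidence < choose_threshold(num_intents)
```

The design point is that validation failures override confidence: a high-confidence but schema-invalid local result still goes to the cloud.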

Why this should improve your benchmark:

- It explicitly catches the failure mode you showed: high-confidence but wrong local calls (especially timer/reminder/multi-call hard tasks).
- It preserves local latency and the on-device routing ratio when local calls are actually coherent.
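The intent inference in step 2 can be sketched as keyword matching over the user text (a hypothetical keyword table; the actual heuristics in main.py may be richer):

```python
# Hypothetical keyword table for the seven intents listed in step 2.
INTENT_KEYWORDS = {
    "weather": ["weather", "forecast", "temperature"],
    "alarm": ["alarm", "wake me"],
    "timer": ["timer", "countdown"],
    "message": ["text", "message", "send"],
    "reminder": ["remind", "reminder"],
    "contacts": ["contact", "number for"],
    "play_music": ["play", "song", "music"],
}

def infer_intents(user_text):
    """Return the set of intents whose keywords appear in the user text."""
    text = user_text.lower()
    return {intent for intent, words in INTENT_KEYWORDS.items()
            if any(w in text for w in words)}
```

The number of inferred intents then feeds the dynamic threshold in step 5, so multi-intent requests are held to a stricter confidence bar.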

Validation status:

- Syntax check passed: python -m py_compile main.py.
- Full benchmark could not be completed in this environment due to blocked network access for the Gemini fallback (httpx.ConnectError), so I couldn't produce a new score locally.

Next step to verify on your side:

1. Run python benchmark.py with working Gemini connectivity and compare the score/F1 breakdown against your current 50.0%.