From 25cd873f62951e0f96b52bb446925e5d9b0992b4 Mon Sep 17 00:00:00 2001
From: Wang Yang <yang4.wang@intel.com>
Date: Fri, 22 May 2026 11:28:16 +0800
Subject: [PATCH 1/3] Fix gpt-oss accuracy issue

---
 ggml/src/ggml-openvino/ggml-openvino.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 1cfbfe0af8e..2aa8798ee7a 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -919,6 +919,16 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
             return true;
         }
+
+        // GPU execution of the MoE routing weights softmax is numerically unstable
+        // when fused with the surrounding GET_ROWS/reshape path. Keep this softmax
+        // on CPU so the scheduler splits at the same boundary that restores parity.
+        if (ggml_openvino_get_device_name() == "GPU" &&
+            op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE &&
+            op->src[0]->src[0] != nullptr &&
+            strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
+            return true;
+        }
         break;
     }
     case GGML_OP_SUM_ROWS: {
@@ -966,6 +976,11 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         break;
     }
     case GGML_OP_PERMUTE: {
+        if (ggml_openvino_get_device_name() == "GPU" && op->src[0] != nullptr && op->src[0]->op == GGML_OP_VIEW &&
+            op->src[0]->src[0] != nullptr && op->src[0]->src[0]->op == GGML_OP_NONE &&
+            !ggml_is_contiguous(op->src[0])) {
+            return true;
+        }
         if (op->type == GGML_TYPE_BF16) {
             // err msg: [GPU] Could not find a suitable kernel for transpose
             // GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n");
@@ -987,6 +1002,12 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         break;
     }
     case GGML_OP_MUL_MAT: {
+        if (ggml_openvino_get_device_name() == "GPU" && op->src[1]->op == GGML_OP_SOFT_MAX &&
+            op->src[0]->op == GGML_OP_CONT && op->src[0]->src[0] != nullptr &&
+            op->src[0]->src[0]->op == GGML_OP_TRANSPOSE && op->src[0]->src[0]->src[0] != nullptr &&
+            op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) {
+            return true;
+        }
         if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) {
             // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"`
             // GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n");

From 5dd95eaedef16ba1c848abfe328748558628851b Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Sun, 24 May 2026 09:07:03 +0530
Subject: [PATCH 2/3] OpenVINO backend: enable arctic for arch test

---
 ggml/src/ggml-openvino/ggml-openvino.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 2aa8798ee7a..08dafa28e14 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -871,6 +871,23 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             // ERR = 0.000000197 > 0.000000100   GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
             return true;
         }
+
+        // Keep the MoE routing weights gather on CPU for GPU runs. Splitting
+        // only at the later SUM/CLAMP/DIV nodes still leaves this routing path
+        // numerically unstable for arctic-style MoE graphs.
+        if (ggml_openvino_get_device_name() == "GPU" &&
+            strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
+            return true;
+        }
+        break;
+    }
+    case GGML_OP_RESHAPE: {
+        if (ggml_openvino_get_device_name() == "GPU") {
+            if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 ||
+                strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
+                return true;
+            }
+        }
         break;
     }
     case GGML_OP_ADD:

From 66655624d56ddbdcf4354265e85052c2d1857a46 Mon Sep 17 00:00:00 2001
From: Xuejun <Xuejun.Zhai@intel.com>
Date: Mon, 25 May 2026 12:33:41 +0530
Subject: [PATCH 3/3] OpenVINO backend: enable grok for arch test

---
 ggml/src/ggml-openvino/ggml-openvino.cpp | 46 +++++++++++++-----------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 08dafa28e14..f224ccdb522 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -875,18 +875,15 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         // Keep the MoE routing weights gather on CPU for GPU runs. Splitting
         // only at the later SUM/CLAMP/DIV nodes still leaves this routing path
         // numerically unstable for arctic-style MoE graphs.
-        if (ggml_openvino_get_device_name() == "GPU" &&
-            strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
+        if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
             return true;
         }
         break;
     }
     case GGML_OP_RESHAPE: {
-        if (ggml_openvino_get_device_name() == "GPU") {
-            if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 ||
-                strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
-                return true;
-            }
+        if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 ||
+            strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
+            return true;
         }
         break;
     }
@@ -925,8 +922,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
 
         // qwen3next MoE weight normalization is numerically sensitive on the GPU
         // path. Keep the normalization divide on CPU to match the reference.
-        if (ggml_openvino_get_device_name() == "GPU" &&
-            strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) {
+        if (strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) {
             return true;
         }
         break;
@@ -937,11 +933,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
             return true;
         }
 
+        if (strncmp(op->name, "ffn_moe_probs", sizeof("ffn_moe_probs") - 1) == 0) {
+            return true;
+        }
+
         // GPU execution of the MoE routing weights softmax is numerically unstable
         // when fused with the surrounding GET_ROWS/reshape path. Keep this softmax
         // on CPU so the scheduler splits at the same boundary that restores parity.
-        if (ggml_openvino_get_device_name() == "GPU" &&
-            op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE &&
+        if (op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE &&
             op->src[0]->src[0] != nullptr &&
             strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
             return true;
@@ -949,8 +948,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         break;
     }
     case GGML_OP_SUM_ROWS: {
-        if (ggml_openvino_get_device_name() == "GPU" &&
-            strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) {
+        if (strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) {
             return true;
         }
 
@@ -961,13 +959,16 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
          break;
     }
     case GGML_OP_CLAMP: {
-        if (ggml_openvino_get_device_name() == "GPU" &&
-            strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) {
+        if (strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) {
             return true;
         }
         break;
     }
     case GGML_OP_FLASH_ATTN_EXT: {
+        // qwen3next currently shows large accuracy drift in OpenVINO flash attention.
+        // Blanket CPU fallback for FLASH_ATTN_EXT, currently disabled; re-enable until parity is restored:
+        // return true;
+
         if (op->src[4] != nullptr) {
             // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n");
             return true;
@@ -993,11 +994,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         break;
     }
     case GGML_OP_PERMUTE: {
-        if (ggml_openvino_get_device_name() == "GPU" && op->src[0] != nullptr && op->src[0]->op == GGML_OP_VIEW &&
-            op->src[0]->src[0] != nullptr && op->src[0]->src[0]->op == GGML_OP_NONE &&
-            !ggml_is_contiguous(op->src[0])) {
-            return true;
-        }
         if (op->type == GGML_TYPE_BF16) {
             // err msg: [GPU] Could not find a suitable kernel for transpose
             // GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n");
@@ -1044,6 +1040,11 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         break;
     }
     case GGML_OP_MUL_MAT_ID: {
+        if (strncmp(op->name, "ffn_moe_gate_up", sizeof("ffn_moe_gate_up") - 1) == 0 ||
+            strncmp(op->name, "ffn_moe_down", sizeof("ffn_moe_down") - 1) == 0) {
+            return true;
+        }
+
         if (mul_mat_id_requires_large_tmp(op)) {
             return true;
         }
@@ -1116,6 +1117,11 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
         }
         break;
     }
+    case GGML_OP_SSM_CONV: {
+        // qwen3next is numerically unstable with OpenVINO SSM_CONV.
+        // Keep this op on CPU until the OpenVINO implementation is fixed.
+        return true;
+    }
     default:
         break;
     }