From 25cd873f62951e0f96b52bb446925e5d9b0992b4 Mon Sep 17 00:00:00 2001 From: Wang Yang Date: Fri, 22 May 2026 11:28:16 +0800 Subject: [PATCH 1/3] Fix gpt-oss accuracy issue --- ggml/src/ggml-openvino/ggml-openvino.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 1cfbfe0af8e..2aa8798ee7a 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -919,6 +919,16 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); return true; } + + // GPU execution of the MoE routing weights softmax is numerically unstable + // when fused with the surrounding GET_ROWS/reshape path. Keep this softmax + // on CPU so the scheduler splits at the same boundary that restores parity. + if (ggml_openvino_get_device_name() == "GPU" && + op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE && + op->src[0]->src[0] != nullptr && + strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) { + return true; + } break; } case GGML_OP_SUM_ROWS: { @@ -966,6 +976,11 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_PERMUTE: { + if (ggml_openvino_get_device_name() == "GPU" && op->src[0] != nullptr && op->src[0]->op == GGML_OP_VIEW && + op->src[0]->src[0] != nullptr && op->src[0]->src[0]->op == GGML_OP_NONE && + !ggml_is_contiguous(op->src[0])) { + return true; + } if (op->type == GGML_TYPE_BF16) { // err msg: [GPU] Could not find a suitable kernel for transpose // GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n"); @@ -987,6 +1002,12 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_MUL_MAT: { + if (ggml_openvino_get_device_name() == "GPU" && op->src[1]->op == GGML_OP_SOFT_MAX && + op->src[0]->op == GGML_OP_CONT && 
op->src[0]->src[0] != nullptr && + op->src[0]->src[0]->op == GGML_OP_TRANSPOSE && op->src[0]->src[0]->src[0] != nullptr && + op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) { + return true; + } if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` // GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); From 5dd95eaedef16ba1c848abfe328748558628851b Mon Sep 17 00:00:00 2001 From: Xuejun Date: Sun, 24 May 2026 09:07:03 +0530 Subject: [PATCH 2/3] OpenVINO backend: enable arctic for arch test --- ggml/src/ggml-openvino/ggml-openvino.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 2aa8798ee7a..08dafa28e14 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -871,6 +871,23 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0) return true; } + + // Keep the MoE routing weights gather on CPU for GPU runs. Splitting + // only at the later SUM/CLAMP/DIV nodes still leaves this routing path + // numerically unstable for arctic-style MoE graphs. 
+ if (ggml_openvino_get_device_name() == "GPU" && + strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) { + return true; + } + break; + } + case GGML_OP_RESHAPE: { + if (ggml_openvino_get_device_name() == "GPU") { + if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 || + strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) { + return true; + } + } break; } case GGML_OP_ADD: From 66655624d56ddbdcf4354265e85052c2d1857a46 Mon Sep 17 00:00:00 2001 From: Xuejun Date: Mon, 25 May 2026 12:33:41 +0530 Subject: [PATCH 3/3] OpenVINO backend: enable grok for arch test --- ggml/src/ggml-openvino/ggml-openvino.cpp | 46 +++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 08dafa28e14..f224ccdb522 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -875,18 +875,15 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // Keep the MoE routing weights gather on CPU for GPU runs. Splitting // only at the later SUM/CLAMP/DIV nodes still leaves this routing path // numerically unstable for arctic-style MoE graphs. 
- if (ggml_openvino_get_device_name() == "GPU" && - strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) { + if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) { return true; } break; } case GGML_OP_RESHAPE: { - if (ggml_openvino_get_device_name() == "GPU") { - if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 || - strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) { - return true; - } + if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 || + strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) { + return true; } break; } @@ -925,8 +922,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // qwen3next MoE weight normalization is numerically sensitive on the GPU // path. Keep the normalization divide on CPU to match the reference. - if (ggml_openvino_get_device_name() == "GPU" && - strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) { + if (strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) { return true; } break; @@ -937,11 +933,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { return true; } + if (strncmp(op->name, "ffn_moe_probs", sizeof("ffn_moe_probs") - 1) == 0) { + return true; + } + // GPU execution of the MoE routing weights softmax is numerically unstable // when fused with the surrounding GET_ROWS/reshape path. Keep this softmax // on CPU so the scheduler splits at the same boundary that restores parity. 
- if (ggml_openvino_get_device_name() == "GPU" && - op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE && + if (op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE && op->src[0]->src[0] != nullptr && strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) { return true; @@ -949,8 +948,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_SUM_ROWS: { - if (ggml_openvino_get_device_name() == "GPU" && - strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) { + if (strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) { return true; } @@ -961,13 +959,16 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_CLAMP: { - if (ggml_openvino_get_device_name() == "GPU" && - strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) { + if (strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) { return true; } break; } case GGML_OP_FLASH_ATTN_EXT: { + // qwen3next currently shows large accuracy drift in OpenVINO flash attention. + // The unconditional CPU fallback below is currently disabled; re-enable it to keep FLASH_ATTN_EXT on CPU until parity is restored. 
+ // return true; + if (op->src[4] != nullptr) { // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n"); return true; @@ -993,11 +994,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_PERMUTE: { - if (ggml_openvino_get_device_name() == "GPU" && op->src[0] != nullptr && op->src[0]->op == GGML_OP_VIEW && - op->src[0]->src[0] != nullptr && op->src[0]->src[0]->op == GGML_OP_NONE && - !ggml_is_contiguous(op->src[0])) { - return true; - } if (op->type == GGML_TYPE_BF16) { // err msg: [GPU] Could not find a suitable kernel for transpose // GGML_LOG_WARN("OpenVINO backend does not support PERMUTE with BF16 type\n"); @@ -1044,6 +1040,11 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_MUL_MAT_ID: { + if (strncmp(op->name, "ffn_moe_gate_up", sizeof("ffn_moe_gate_up") - 1) == 0 || + strncmp(op->name, "ffn_moe_down", sizeof("ffn_moe_down") - 1) == 0) { + return true; + } + if (mul_mat_id_requires_large_tmp(op)) { return true; } @@ -1116,6 +1117,11 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { } break; } + case GGML_OP_SSM_CONV: { + // qwen3next is numerically unstable with OpenVINO SSM_CONV. + // Keep this op on CPU until the OpenVINO implementation is fixed. + return true; + } default: break; }