From ed986aa39c2b596c085c4d7d9098a3db6b929513 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 21 May 2026 15:42:32 -0700 Subject: [PATCH 1/5] Initial gemma4 npu support --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 +- ggml/src/ggml-openvino/ggml-decoder.h | 4 + ggml/src/ggml-openvino/openvino/decoder.h | 2 + .../src/ggml-openvino/openvino/node_context.h | 24 +++++ ggml/src/ggml-openvino/openvino/op/cont.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/view.cpp | 98 ++++++++++++++++++- ggml/src/ggml-openvino/openvino/utils.cpp | 95 ++++++++++++++++++ 7 files changed, 225 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d005b40458f..f5054641615 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1131,7 +1131,7 @@ ov::PartialShape GgmlOvDecoder::get_view_input_ov_shape(int node_idx, const std: if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) { int dynamic_dim_index = dynamic_it->second; // GGML uses reverse indexing, so convert to OpenVINO indexing - shape[3 - dynamic_dim_index] = -1; + shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1; } return shape; @@ -1154,7 +1154,7 @@ ov::PartialShape GgmlOvDecoder::get_view_input_src_ov_shape(int node_idx, const if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) { int dynamic_dim_index = dynamic_it->second; // GGML uses reverse indexing, so convert to OpenVINO indexing - shape[3 - dynamic_dim_index] = -1; + shape[3 - dynamic_dim_index] = m_is_static ? 
get_static_n_tokens() : -1; } return shape; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 91850a000b5..35bed0ba476 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -206,6 +206,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool is_stateful() const override { return m_is_stateful; } + virtual int get_static_n_tokens() const override { + return m_is_prefill ? m_prefill_chunk_size : 1; + } + virtual bool is_splited_model() const override { return m_model_is_splitted; } diff --git a/ggml/src/ggml-openvino/openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.h index bc41876875c..c602aae73d7 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.h +++ b/ggml/src/ggml-openvino/openvino/decoder.h @@ -101,6 +101,8 @@ class GgmlDecoder : public DecoderBase { virtual int is_swa_layer(int layer) const = 0; virtual int32_t get_op_dynamic_dim(int node_idx) const = 0; + + virtual int get_static_n_tokens() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h index 2402a74a908..8e834caa422 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.h +++ b/ggml/src/ggml-openvino/openvino/node_context.h @@ -125,6 +125,16 @@ class NodeContext : public frontend::NodeContext { if (view_input_size > 0) { // This is a VIEW input, get the base tensor name (last element in the chain) std::string base_name = m_decoder->get_view_input_src_name(m_node_idx, m_input_names[idx], view_input_size - 1); + // Check if the VIEW has been resolved (translate_view produced a Slice) + auto view_it = m_tensor_map->find(m_input_names[idx]); + if (!base_name.empty() && view_it != m_tensor_map->end()) { + auto base_it = m_tensor_map->find(base_name); + if (base_it != m_tensor_map->end() && + view_it->second.get_node_shared_ptr() != base_it->second.get_node_shared_ptr()) { + return 
view_it->second; + } + return base_it->second; + } if (!base_name.empty()) { return m_tensor_map->at(base_name); } @@ -133,6 +143,18 @@ class NodeContext : public frontend::NodeContext { return m_tensor_map->at(m_input_names[idx]); } + void cache_tensor(const std::string& name, const Output& tensor) const { + (*m_tensor_map)[name] = tensor; + } + + Output get_cached_tensor(const std::string& name) const { + auto it = m_tensor_map->find(name); + if (it != m_tensor_map->end()) { + return it->second; + } + return Output(); + } + Output get_input(const std::string& name) const override { if (m_tensor_map->find(name) == m_tensor_map->end()) { throw std::runtime_error("'" + name + "' not found in tensor map."); @@ -160,6 +182,8 @@ class NodeContext : public frontend::NodeContext { bool is_stateful() const { return m_decoder->is_stateful(); } + int get_static_n_tokens() const { return m_decoder->get_static_n_tokens(); } + private: std::shared_ptr m_decoder; std::shared_ptr& m_tensor_map; diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 1d6cc672126..fed72cbfb93 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -22,7 +22,7 @@ OutputVector translate_cont(const NodeContext & context) { auto dst_shape = context.get_output_shape().to_shape(); if (context.get_op_dynamic_dim() != -1) { - dst_shape[3 - context.get_op_dynamic_dim()] = -1; + dst_shape[3 - context.get_op_dynamic_dim()] = context.is_static() ? 
context.get_static_n_tokens() : -1; } auto input = process_view_input_new(context, 0); diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 7d777291939..33ea8517c88 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -1,6 +1,8 @@ #include "../op_table.h" #include "../utils.h" +#include #include +#include #include namespace ov { namespace frontend { @@ -9,7 +11,101 @@ namespace op { OutputVector translate_view(const NodeContext & context) { num_inputs_check(context, 1, 1); - return {context.get_input(0)}; + + if (!context.is_static()) { + return {context.get_input(0)}; + } + + auto input = context.get_input(0); + auto src_shape = context.get_input_shape(0); + auto dst_shape = context.get_output_shape(); + + if (src_shape.rank().is_dynamic() || dst_shape.rank().is_dynamic()) { + return {input}; + } + + int64_t src_elems = 1, dst_elems = 1; + for (int64_t i = 0; i < src_shape.rank().get_length(); ++i) { + if (src_shape[i].is_dynamic()) return {input}; + src_elems *= src_shape[i].get_length(); + } + for (int64_t i = 0; i < dst_shape.rank().get_length(); ++i) { + if (dst_shape[i].is_dynamic()) return {input}; + dst_elems *= dst_shape[i].get_length(); + } + + if (dst_elems >= src_elems) { + return {input}; + } + + auto src_stride = context.get_input_stride(0); + auto dst_stride = context.get_output_stride(); + size_t view_offset = context.get_output_op_offset(); + + bool same_stride = (src_stride.size() == dst_stride.size()); + if (same_stride) { + for (size_t i = 0; i < src_stride.size(); ++i) { + if (src_stride[i] != dst_stride[i]) { + same_stride = false; + break; + } + } + } + + if (!same_stride) { + return {input}; + } + + auto src_ov_shape = src_shape.to_shape(); + auto dst_ov_shape = dst_shape.to_shape(); + size_t ndims = src_ov_shape.size(); + if (dst_ov_shape.size() != ndims) { + return {input}; + } + + std::vector diff_dims; + for (size_t i = 0; i < 
ndims; ++i) { + if (src_ov_shape[i] != dst_ov_shape[i]) { + diff_dims.push_back(static_cast(i)); + } + } + + if (diff_dims.size() != 1) { + return {input}; + } + + int slice_dim = diff_dims[0]; + int64_t dim_size = static_cast(src_ov_shape[slice_dim]); + + size_t stride_at_dim = (slice_dim < static_cast(ndims) - 1) ? + src_stride[slice_dim + 1] : src_stride[slice_dim]; + + size_t ov_stride_for_dim = 1; + for (size_t i = slice_dim + 1; i < ndims; ++i) { + ov_stride_for_dim *= src_ov_shape[i]; + } + size_t elem_size = src_stride.back(); + if (elem_size == 0) elem_size = 1; + + int64_t begin_val = 0; + if (ov_stride_for_dim > 0 && elem_size > 0) { + begin_val = static_cast((view_offset / elem_size) / ov_stride_for_dim); + } + int64_t end_val = begin_val + static_cast(dst_ov_shape[slice_dim]); + + if (begin_val < 0 || end_val > dim_size) { + return {input}; + } + + auto sliced = std::make_shared( + input, + ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim})); + + sliced->set_friendly_name(context.get_output_name()); + return {sliced->output(0)}; } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index e0344aee3b8..3ca11989245 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -262,6 +263,100 @@ ov::Output process_view_input_new(const NodeContext & context, int inp return input; } + // If translate_view already resolved this VIEW (produced a Slice), the input + // will already have the expected shape — skip re-slicing. 
+ auto expected_ov_shape = context.get_view_input_ov_shape(input_index, 0); + auto actual_shape = input.get_partial_shape(); + if (expected_ov_shape.rank().is_static() && actual_shape.rank().is_static() && + expected_ov_shape.rank() == actual_shape.rank()) { + bool shapes_match = true; + for (int64_t i = 0; i < expected_ov_shape.rank().get_length(); ++i) { + if (expected_ov_shape[i].is_static() && actual_shape[i].is_static() && + expected_ov_shape[i] != actual_shape[i]) { + shapes_match = false; + break; + } + } + if (shapes_match) { + return input; + } + } + + // In static mode, use Split instead of Slice for single-dimension reductions. + // This ensures NPUW's FOLD doesn't parametrize per-layer slice indices (which + // would introduce dynamic shapes). A shared Split node sits outside the repeated + // subgraph boundary; each layer receives one of its output ports. + if (context.is_static() && view_input_size == 1) { + auto view_stride_v = context.get_view_input_stride(input_index, 0); + auto view_src_stride_v = context.get_view_input_src_stride(input_index, 0); + auto view_ggml_shape = context.get_view_input_ggml_shape(input_index, 0); + auto view_src_ggml_shape = context.get_view_input_src_ggml_shape(input_index, 0); + auto view_offset = context.get_view_input_offset(input_index, 0); + auto view_src_offset = context.get_view_input_src_offset(input_index, 0); + + size_t ndims = view_ggml_shape.size(); + std::vector diff_dims; + if (view_src_ggml_shape.size() == ndims) { + for (size_t i = 0; i < ndims; ++i) { + if (view_ggml_shape[i] != view_src_ggml_shape[i]) { + diff_dims.push_back(static_cast(i)); + } + } + } + + if (diff_dims.size() == 1) { + int split_dim = diff_dims[0]; + int64_t num_splits = static_cast(view_src_ggml_shape[split_dim]); + int64_t chunk_size = static_cast(view_ggml_shape[split_dim]); + + // Only apply when slicing exactly 1 element from a multi-element dimension + if (chunk_size == 1 && num_splits > 1) { + // Check suffix strides match 
(dimensions after split_dim) + bool suffix_ok = view_stride_v.size() == view_src_stride_v.size(); + if (suffix_ok) { + for (size_t i = static_cast(split_dim) + 1; i < ndims; ++i) { + if (view_stride_v[i] != view_src_stride_v[i]) { + suffix_ok = false; + break; + } + } + } + + if (suffix_ok && view_src_stride_v[split_dim] > 0) { + size_t relative_offset = view_offset >= view_src_offset ? + view_offset - view_src_offset : 0; + int64_t split_index = static_cast( + relative_offset / view_src_stride_v[split_dim]); + + if (split_index >= 0 && split_index < num_splits) { + // TODO: avoid hardcoded name + std::string src_name = context.get_view_input_src_name(input_index, 0); + std::string cache_key = "__split__" + src_name + "__" + + std::to_string(split_dim) + "__"; + + auto cached = context.get_cached_tensor(cache_key + "0"); + if (cached.get_node_shared_ptr() == nullptr) { + auto axis_const = ov::op::v0::Constant::create( + ov::element::i64, {}, {static_cast(split_dim)}); + auto split_node = std::make_shared( + input, axis_const, static_cast(num_splits)); + split_node->set_friendly_name(src_name + "_split"); + + for (int64_t p = 0; p < num_splits; ++p) { + context.cache_tensor( + cache_key + std::to_string(p), + split_node->output(static_cast(p))); + } + } + + return context.get_cached_tensor( + cache_key + std::to_string(split_index)); + } + } + } + } + } + // Lambda function to process a single view operation auto process_single_view = [](ov::Output current, size_t view_offset, From 98093b1720df8682165d3e8225afeb101a430ec3 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 21 May 2026 16:04:57 -0700 Subject: [PATCH 2/5] temp. 
fix for gemma4 accuracy bug on npu --- ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index d9fa4c24367..4124b6550b3 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -49,6 +50,16 @@ OutputVector translate_glu_geglu(const NodeContext & context) { std::swap(src0, src1); } + if (context.is_static()) { + // TODO: Temporary solution for NPU accuracy issue due to fp16 overflow + // To be removed once permanent solution is implemented + // Justification: + // For |x| > 5, GELU(x) ≈ max(x, 0) (behaves like ReLU) + // So Clamp(-10, 10) only changes inputs with |x| > 10; inputs below -10 still give GELU ≈ 0. + // The only loss: inputs > 10 yield ≈ 10 instead of ≈ x. + // In practice, FFN intermediates rarely exceed 10 after GEGLU gating. 
+ src0 = std::make_shared(src0, -10.0, 10.0); + } auto gelu = std::make_shared(src0); auto res = std::make_shared(gelu, src1); From 51ad7b6d7b73e5cd85830c63384326485eef2760 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 21 May 2026 16:19:31 -0700 Subject: [PATCH 3/5] Remove hardcoded names for npu-fold handling --- .../src/ggml-openvino/openvino/node_context.h | 12 ---------- ggml/src/ggml-openvino/openvino/utils.cpp | 23 +++++++------------ 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h index 8e834caa422..a34764dde6e 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.h +++ b/ggml/src/ggml-openvino/openvino/node_context.h @@ -143,18 +143,6 @@ class NodeContext : public frontend::NodeContext { return m_tensor_map->at(m_input_names[idx]); } - void cache_tensor(const std::string& name, const Output& tensor) const { - (*m_tensor_map)[name] = tensor; - } - - Output get_cached_tensor(const std::string& name) const { - auto it = m_tensor_map->find(name); - if (it != m_tensor_map->end()) { - return it->second; - } - return Output(); - } - Output get_input(const std::string& name) const override { if (m_tensor_map->find(name) == m_tensor_map->end()) { throw std::runtime_error("'" + name + "' not found in tensor map."); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 3ca11989245..e8be8cfe790 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -329,28 +329,21 @@ ov::Output process_view_input_new(const NodeContext & context, int inp relative_offset / view_src_stride_v[split_dim]); if (split_index >= 0 && split_index < num_splits) { - // TODO: avoid hardcoded name - std::string src_name = context.get_view_input_src_name(input_index, 0); - std::string cache_key = "__split__" + src_name + "__" + - std::to_string(split_dim) + "__"; + auto 
src_node = input.get_node_shared_ptr(); + std::string rt_key = "split_dim_" + std::to_string(split_dim); + auto & rt_info = src_node->get_rt_info(); - auto cached = context.get_cached_tensor(cache_key + "0"); - if (cached.get_node_shared_ptr() == nullptr) { + if (rt_info.find(rt_key) == rt_info.end()) { auto axis_const = ov::op::v0::Constant::create( ov::element::i64, {}, {static_cast(split_dim)}); auto split_node = std::make_shared( input, axis_const, static_cast(num_splits)); - split_node->set_friendly_name(src_name + "_split"); - - for (int64_t p = 0; p < num_splits; ++p) { - context.cache_tensor( - cache_key + std::to_string(p), - split_node->output(static_cast(p))); - } + split_node->set_friendly_name(src_node->get_friendly_name() + "_split"); + rt_info[rt_key] = split_node; } - return context.get_cached_tensor( - cache_key + std::to_string(split_index)); + auto split_node = rt_info[rt_key].as>(); + return split_node->output(static_cast(split_index)); } } } From 0fa0534f40752f022f496f4533f872530e3edb86 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Thu, 21 May 2026 16:50:36 -0700 Subject: [PATCH 4/5] revert static n tokens for cont translation as it is not needed --- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- ggml/src/ggml-openvino/openvino/decoder.h | 2 -- ggml/src/ggml-openvino/openvino/node_context.h | 2 -- ggml/src/ggml-openvino/openvino/op/cont.cpp | 2 +- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 35bed0ba476..d59180ce149 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -206,7 +206,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool is_stateful() const override { return m_is_stateful; } - virtual int get_static_n_tokens() const override { + int get_static_n_tokens() const { return m_is_prefill ? 
m_prefill_chunk_size : 1; } diff --git a/ggml/src/ggml-openvino/openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.h index c602aae73d7..bc41876875c 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.h +++ b/ggml/src/ggml-openvino/openvino/decoder.h @@ -101,8 +101,6 @@ class GgmlDecoder : public DecoderBase { virtual int is_swa_layer(int layer) const = 0; virtual int32_t get_op_dynamic_dim(int node_idx) const = 0; - - virtual int get_static_n_tokens() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h index a34764dde6e..383ee8ac4ba 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.h +++ b/ggml/src/ggml-openvino/openvino/node_context.h @@ -170,8 +170,6 @@ class NodeContext : public frontend::NodeContext { bool is_stateful() const { return m_decoder->is_stateful(); } - int get_static_n_tokens() const { return m_decoder->get_static_n_tokens(); } - private: std::shared_ptr m_decoder; std::shared_ptr& m_tensor_map; diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index fed72cbfb93..1d6cc672126 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -22,7 +22,7 @@ OutputVector translate_cont(const NodeContext & context) { auto dst_shape = context.get_output_shape().to_shape(); if (context.get_op_dynamic_dim() != -1) { - dst_shape[3 - context.get_op_dynamic_dim()] = context.is_static() ? 
context.get_static_n_tokens() : -1; + dst_shape[3 - context.get_op_dynamic_dim()] = -1; } auto input = process_view_input_new(context, 0); From 59b8969add8d54839583a20ba121b992c63661e7 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Mon, 25 May 2026 08:17:18 -0700 Subject: [PATCH 5/5] removed unused variable --- ggml/src/ggml-openvino/openvino/op/view.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 33ea8517c88..183d6bb7e58 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -77,9 +77,6 @@ OutputVector translate_view(const NodeContext & context) { int slice_dim = diff_dims[0]; int64_t dim_size = static_cast(src_ov_shape[slice_dim]); - size_t stride_at_dim = (slice_dim < static_cast(ndims) - 1) ? - src_stride[slice_dim + 1] : src_stride[slice_dim]; - size_t ov_stride_for_dim = 1; for (size_t i = slice_dim + 1; i < ndims; ++i) { ov_stride_for_dim *= src_ov_shape[i];