diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d005b40458f..f5054641615 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1131,7 +1131,7 @@ ov::PartialShape GgmlOvDecoder::get_view_input_ov_shape(int node_idx, const std: if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) { int dynamic_dim_index = dynamic_it->second; // GGML uses reverse indexing, so convert to OpenVINO indexing - shape[3 - dynamic_dim_index] = -1; + shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1; } return shape; @@ -1154,7 +1154,7 @@ ov::PartialShape GgmlOvDecoder::get_view_input_src_ov_shape(int node_idx, const if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) { int dynamic_dim_index = dynamic_it->second; // GGML uses reverse indexing, so convert to OpenVINO indexing - shape[3 - dynamic_dim_index] = -1; + shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1; } return shape; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 91850a000b5..d59180ce149 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -206,6 +206,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual bool is_stateful() const override { return m_is_stateful; } + int get_static_n_tokens() const { + return m_is_prefill ? m_prefill_chunk_size : 1; + } + virtual bool is_splited_model() const override { return m_model_is_splitted; } diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h index 2402a74a908..383ee8ac4ba 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.h +++ b/ggml/src/ggml-openvino/openvino/node_context.h @@ -125,6 +125,16 @@ class NodeContext : public frontend::NodeContext { if (view_input_size > 0) { // This is a VIEW input, get the base tensor name (last element in the chain) std::string base_name = m_decoder->get_view_input_src_name(m_node_idx, m_input_names[idx], view_input_size - 1); + // Check if the VIEW has been resolved (translate_view produced a Slice) + auto view_it = m_tensor_map->find(m_input_names[idx]); + if (!base_name.empty() && view_it != m_tensor_map->end()) { + auto base_it = m_tensor_map->find(base_name); + if (base_it != m_tensor_map->end() && + view_it->second.get_node_shared_ptr() != base_it->second.get_node_shared_ptr()) { + return view_it->second; + } + return base_it->second; + } if (!base_name.empty()) { return m_tensor_map->at(base_name); } diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index d9fa4c24367..4124b6550b3 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -49,6 +50,16 @@ OutputVector translate_glu_geglu(const NodeContext & context) { std::swap(src0, src1); } + if (context.is_static()) { + // TODO: Temporary solution for NPU accuracy issue due to fp16 overflow + // To be removed once permanent solution is implemented + // Justification: + // For |x| > 5, GELU(x) ≈ max(x, 0) (behaves like ReLU) + // So Clamp(-10, 10) only affects values where GELU would return ≈ x anyway. + // The only loss: values > 10 get mapped to 10 instead of x. + // In practice, FFN intermediates rarely exceed 10 after GEGLU gating. + src0 = std::make_shared(src0, -10.0, 10.0); + } auto gelu = std::make_shared(src0); auto res = std::make_shared(gelu, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 7d777291939..183d6bb7e58 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -1,6 +1,8 @@ #include "../op_table.h" #include "../utils.h" +#include #include +#include #include namespace ov { namespace frontend { @@ -9,7 +11,98 @@ namespace op { OutputVector translate_view(const NodeContext & context) { num_inputs_check(context, 1, 1); - return {context.get_input(0)}; + + if (!context.is_static()) { + return {context.get_input(0)}; + } + + auto input = context.get_input(0); + auto src_shape = context.get_input_shape(0); + auto dst_shape = context.get_output_shape(); + + if (src_shape.rank().is_dynamic() || dst_shape.rank().is_dynamic()) { + return {input}; + } + + int64_t src_elems = 1, dst_elems = 1; + for (int64_t i = 0; i < src_shape.rank().get_length(); ++i) { + if (src_shape[i].is_dynamic()) return {input}; + src_elems *= src_shape[i].get_length(); + } + for (int64_t i = 0; i < dst_shape.rank().get_length(); ++i) { + if (dst_shape[i].is_dynamic()) return {input}; + dst_elems *= dst_shape[i].get_length(); + } + + if (dst_elems >= src_elems) { + return {input}; + } + + auto src_stride = context.get_input_stride(0); + auto dst_stride = context.get_output_stride(); + size_t view_offset = context.get_output_op_offset(); + + bool same_stride = (src_stride.size() == dst_stride.size()); + if (same_stride) { + for (size_t i = 0; i < src_stride.size(); ++i) { + if (src_stride[i] != dst_stride[i]) { + same_stride = false; + break; + } + } + } + + if (!same_stride) { + return {input}; + } + + auto src_ov_shape = src_shape.to_shape(); + auto dst_ov_shape = dst_shape.to_shape(); + size_t ndims = src_ov_shape.size(); + if (dst_ov_shape.size() != ndims) { + return {input}; + } + + std::vector diff_dims; + for (size_t i = 0; i < ndims; ++i) { + if (src_ov_shape[i] != dst_ov_shape[i]) { + diff_dims.push_back(static_cast(i)); + } + } + + if (diff_dims.size() != 1) { + return {input}; + } + + int slice_dim = diff_dims[0]; + int64_t dim_size = static_cast(src_ov_shape[slice_dim]); + + size_t ov_stride_for_dim = 1; + for (size_t i = slice_dim + 1; i < ndims; ++i) { + ov_stride_for_dim *= src_ov_shape[i]; + } + size_t elem_size = src_stride.back(); + if (elem_size == 0) elem_size = 1; + + int64_t begin_val = 0; + if (ov_stride_for_dim > 0 && elem_size > 0) { + begin_val = static_cast((view_offset / elem_size) / ov_stride_for_dim); + } + int64_t end_val = begin_val + static_cast(dst_ov_shape[slice_dim]); + + if (begin_val < 0 || end_val > dim_size) { + return {input}; + } + + auto sliced = std::make_shared( + input, + ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim})); + + sliced->set_friendly_name(context.get_output_name()); + return {sliced->output(0)}; } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index e0344aee3b8..e8be8cfe790 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -262,6 +263,93 @@ ov::Output process_view_input_new(const NodeContext & context, int inp return input; } + // If translate_view already resolved this VIEW (produced a Slice), the input + // will already have the expected shape — skip re-slicing. + auto expected_ov_shape = context.get_view_input_ov_shape(input_index, 0); + auto actual_shape = input.get_partial_shape(); + if (expected_ov_shape.rank().is_static() && actual_shape.rank().is_static() && + expected_ov_shape.rank() == actual_shape.rank()) { + bool shapes_match = true; + for (int64_t i = 0; i < expected_ov_shape.rank().get_length(); ++i) { + if (expected_ov_shape[i].is_static() && actual_shape[i].is_static() && + expected_ov_shape[i] != actual_shape[i]) { + shapes_match = false; + break; + } + } + if (shapes_match) { + return input; + } + } + + // In static mode, use Split instead of Slice for single-dimension reductions. + // This ensures NPUW's FOLD doesn't parametrize per-layer slice indices (which + // would introduce dynamic shapes). A shared Split node sits outside the repeated + // subgraph boundary; each layer receives one of its output ports. + if (context.is_static() && view_input_size == 1) { + auto view_stride_v = context.get_view_input_stride(input_index, 0); + auto view_src_stride_v = context.get_view_input_src_stride(input_index, 0); + auto view_ggml_shape = context.get_view_input_ggml_shape(input_index, 0); + auto view_src_ggml_shape = context.get_view_input_src_ggml_shape(input_index, 0); + auto view_offset = context.get_view_input_offset(input_index, 0); + auto view_src_offset = context.get_view_input_src_offset(input_index, 0); + + size_t ndims = view_ggml_shape.size(); + std::vector diff_dims; + if (view_src_ggml_shape.size() == ndims) { + for (size_t i = 0; i < ndims; ++i) { + if (view_ggml_shape[i] != view_src_ggml_shape[i]) { + diff_dims.push_back(static_cast(i)); + } + } + } + + if (diff_dims.size() == 1) { + int split_dim = diff_dims[0]; + int64_t num_splits = static_cast(view_src_ggml_shape[split_dim]); + int64_t chunk_size = static_cast(view_ggml_shape[split_dim]); + + // Only apply when slicing exactly 1 element from a multi-element dimension + if (chunk_size == 1 && num_splits > 1) { + // Check suffix strides match (dimensions after split_dim) + bool suffix_ok = view_stride_v.size() == view_src_stride_v.size(); + if (suffix_ok) { + for (size_t i = static_cast(split_dim) + 1; i < ndims; ++i) { + if (view_stride_v[i] != view_src_stride_v[i]) { + suffix_ok = false; + break; + } + } + } + + if (suffix_ok && view_src_stride_v[split_dim] > 0) { + size_t relative_offset = view_offset >= view_src_offset ? + view_offset - view_src_offset : 0; + int64_t split_index = static_cast( + relative_offset / view_src_stride_v[split_dim]); + + if (split_index >= 0 && split_index < num_splits) { + auto src_node = input.get_node_shared_ptr(); + std::string rt_key = "split_dim_" + std::to_string(split_dim); + auto & rt_info = src_node->get_rt_info(); + + if (rt_info.find(rt_key) == rt_info.end()) { + auto axis_const = ov::op::v0::Constant::create( + ov::element::i64, {}, {static_cast(split_dim)}); + auto split_node = std::make_shared( + input, axis_const, static_cast(num_splits)); + split_node->set_friendly_name(src_node->get_friendly_name() + "_split"); + rt_info[rt_key] = split_node; + } + + auto split_node = rt_info[rt_key].as>(); + return split_node->output(static_cast(split_index)); + } + } + } + } + } + // Lambda function to process a single view operation auto process_single_view = [](ov::Output current, size_t view_offset,