Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1131,7 +1131,7 @@ ov::PartialShape GgmlOvDecoder::get_view_input_ov_shape(int node_idx, const std:
if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) {
int dynamic_dim_index = dynamic_it->second;
// GGML uses reverse indexing, so convert to OpenVINO indexing
shape[3 - dynamic_dim_index] = -1;
shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1;
}

return shape;
Expand All @@ -1154,7 +1154,7 @@ ov::PartialShape GgmlOvDecoder::get_view_input_src_ov_shape(int node_idx, const
if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) {
int dynamic_dim_index = dynamic_it->second;
// GGML uses reverse indexing, so convert to OpenVINO indexing
shape[3 - dynamic_dim_index] = -1;
shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1;
}

return shape;
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-openvino/ggml-decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

// Reports the value of the decoder's m_is_stateful flag.
virtual bool is_stateful() const override {
    return m_is_stateful;
}

// Token count to use for static-shape graphs: m_prefill_chunk_size when
// m_is_prefill is set, otherwise 1.
int get_static_n_tokens() const {
    if (m_is_prefill) {
        return m_prefill_chunk_size;
    }
    return 1;
}

// Reports the value of the decoder's m_model_is_splitted flag.
virtual bool is_splited_model() const override {
    const bool splitted = m_model_is_splitted;
    return splitted;
}
Expand Down
10 changes: 10 additions & 0 deletions ggml/src/ggml-openvino/openvino/node_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ class NodeContext : public frontend::NodeContext {
if (view_input_size > 0) {
// This is a VIEW input, get the base tensor name (last element in the chain)
std::string base_name = m_decoder->get_view_input_src_name(m_node_idx, m_input_names[idx], view_input_size - 1);
// Check if the VIEW has been resolved (translate_view produced a Slice)
auto view_it = m_tensor_map->find(m_input_names[idx]);
if (!base_name.empty() && view_it != m_tensor_map->end()) {
auto base_it = m_tensor_map->find(base_name);
if (base_it != m_tensor_map->end() &&
view_it->second.get_node_shared_ptr() != base_it->second.get_node_shared_ptr()) {
return view_it->second;
}
return base_it->second;
}
if (!base_name.empty()) {
return m_tensor_map->at(base_name);
}
Expand Down
11 changes: 11 additions & 0 deletions ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <memory>
#include <openvino/core/node_output.hpp>
#include <openvino/op/clamp.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/gelu.hpp>
#include <openvino/op/multiply.hpp>
Expand Down Expand Up @@ -49,6 +50,16 @@ OutputVector translate_glu_geglu(const NodeContext & context) {
std::swap(src0, src1);
}

if (context.is_static()) {
// TODO: Temporary solution for NPU accuracy issue due to fp16 overflow
// To be removed once permanent solution is implemented
// Justification:
// For |x| > 5, GELU(x) ≈ max(x, 0) (behaves like ReLU)
// So Clamp(-10, 10) only affects values where GELU would return ≈ x anyway.
// The only loss: values > 10 get mapped to 10 instead of x.
// In practice, FFN intermediates rarely exceed 10 after GEGLU gating.
src0 = std::make_shared<ov::op::v0::Clamp>(src0, -10.0, 10.0);
}
auto gelu = std::make_shared<ov::op::v7::Gelu>(src0);
auto res = std::make_shared<ov::op::v1::Multiply>(gelu, src1);

Expand Down
95 changes: 94 additions & 1 deletion ggml/src/ggml-openvino/openvino/op/view.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include "../op_table.h"
#include "../utils.h"
#include <openvino/op/constant.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/slice.hpp>
#include <set>
namespace ov {
namespace frontend {
Expand All @@ -9,7 +11,98 @@ namespace op {

// Translates a GGML VIEW node.
//
// Dynamic-shape mode (context.is_static() == false): the input is forwarded
// unchanged.
//
// Static-shape mode: if the view has fully static source/destination shapes of
// the same rank, identical strides, fewer elements than the source, and differs
// from the source in exactly one dimension, it is lowered to an explicit
// step-1 Slice along that dimension. Every other case falls back to forwarding
// the input unchanged.
//
// Returns a single-element OutputVector holding either the original input or
// the Slice output.
OutputVector translate_view(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    if (!context.is_static()) {
        return {context.get_input(0)};
    }

    auto input = context.get_input(0);
    auto src_shape = context.get_input_shape(0);
    auto dst_shape = context.get_output_shape();

    // A Slice with constant bounds can only be derived from fully static shapes.
    if (src_shape.rank().is_dynamic() || dst_shape.rank().is_dynamic()) {
        return {input};
    }

    int64_t src_elems = 1, dst_elems = 1;
    for (int64_t i = 0; i < src_shape.rank().get_length(); ++i) {
        if (src_shape[i].is_dynamic()) return {input};
        src_elems *= src_shape[i].get_length();
    }
    for (int64_t i = 0; i < dst_shape.rank().get_length(); ++i) {
        if (dst_shape[i].is_dynamic()) return {input};
        dst_elems *= dst_shape[i].get_length();
    }

    // Only views that select a strict subset of the source need a Slice.
    if (dst_elems >= src_elems) {
        return {input};
    }

    auto src_stride = context.get_input_stride(0);
    auto dst_stride = context.get_output_stride();
    size_t view_offset = context.get_output_op_offset();

    // The view must walk memory exactly like its source (same stride per dim).
    bool same_stride = (src_stride.size() == dst_stride.size());
    if (same_stride) {
        for (size_t i = 0; i < src_stride.size(); ++i) {
            if (src_stride[i] != dst_stride[i]) {
                same_stride = false;
                break;
            }
        }
    }

    if (!same_stride) {
        return {input};
    }

    auto src_ov_shape = src_shape.to_shape();
    auto dst_ov_shape = dst_shape.to_shape();
    size_t ndims = src_ov_shape.size();
    if (dst_ov_shape.size() != ndims) {
        return {input};
    }

    // Collect the dimensions whose extent differs between source and view.
    std::vector<int> diff_dims;
    for (size_t i = 0; i < ndims; ++i) {
        if (src_ov_shape[i] != dst_ov_shape[i]) {
            diff_dims.push_back(static_cast<int>(i));
        }
    }

    // Only a reduction along exactly one dimension maps onto a single Slice.
    if (diff_dims.size() != 1) {
        return {input};
    }

    int slice_dim = diff_dims[0];
    int64_t dim_size = static_cast<int64_t>(src_ov_shape[slice_dim]);

    // Number of elements spanned by one step along slice_dim, computed from the
    // extents of the dimensions that follow it.
    // NOTE(review): this assumes the source is densely packed — confirm.
    size_t ov_stride_for_dim = 1;
    for (size_t i = slice_dim + 1; i < ndims; ++i) {
        ov_stride_for_dim *= src_ov_shape[i];
    }
    // NOTE(review): presumably the last stride entry is the per-element size and
    // view_offset is expressed in the same unit — confirm against the decoder.
    size_t elem_size = src_stride.back();
    if (elem_size == 0) elem_size = 1;

    // Convert the view offset into a start index along slice_dim.
    // NOTE(review): integer division silently truncates an offset that is not a
    // whole multiple of the slice-dim stride — confirm such views cannot occur.
    int64_t begin_val = 0;
    if (ov_stride_for_dim > 0 && elem_size > 0) {
        begin_val = static_cast<int64_t>((view_offset / elem_size) / ov_stride_for_dim);
    }
    int64_t end_val = begin_val + static_cast<int64_t>(dst_ov_shape[slice_dim]);

    // Refuse to emit a Slice whose range would fall outside the source dimension.
    if (begin_val < 0 || end_val > dim_size) {
        return {input};
    }

    // Slice inputs: data, start, stop, step, axes (one entry each).
    auto sliced = std::make_shared<ov::op::v8::Slice>(
        input,
        ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}),
        ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}),
        ov::op::v0::Constant::create(ov::element::i64, {1}, {1}),
        ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim}));

    sliced->set_friendly_name(context.get_output_name());
    return {sliced->output(0)};
}

} // namespace op
Expand Down
88 changes: 88 additions & 0 deletions ggml/src/ggml-openvino/openvino/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/split.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
Expand Down Expand Up @@ -262,6 +263,93 @@ ov::Output<ov::Node> process_view_input_new(const NodeContext & context, int inp
return input;
}

// If translate_view already resolved this VIEW (produced a Slice), the input
// will already have the expected shape — skip re-slicing.
auto expected_ov_shape = context.get_view_input_ov_shape(input_index, 0);
auto actual_shape = input.get_partial_shape();
if (expected_ov_shape.rank().is_static() && actual_shape.rank().is_static() &&
expected_ov_shape.rank() == actual_shape.rank()) {
bool shapes_match = true;
for (int64_t i = 0; i < expected_ov_shape.rank().get_length(); ++i) {
if (expected_ov_shape[i].is_static() && actual_shape[i].is_static() &&
expected_ov_shape[i] != actual_shape[i]) {
shapes_match = false;
break;
}
}
if (shapes_match) {
return input;
}
}

// In static mode, use Split instead of Slice for single-dimension reductions.
// This ensures NPUW's FOLD doesn't parametrize per-layer slice indices (which
// would introduce dynamic shapes). A shared Split node sits outside the repeated
// subgraph boundary; each layer receives one of its output ports.
if (context.is_static() && view_input_size == 1) {
auto view_stride_v = context.get_view_input_stride(input_index, 0);
auto view_src_stride_v = context.get_view_input_src_stride(input_index, 0);
auto view_ggml_shape = context.get_view_input_ggml_shape(input_index, 0);
auto view_src_ggml_shape = context.get_view_input_src_ggml_shape(input_index, 0);
auto view_offset = context.get_view_input_offset(input_index, 0);
auto view_src_offset = context.get_view_input_src_offset(input_index, 0);

size_t ndims = view_ggml_shape.size();
std::vector<int> diff_dims;
if (view_src_ggml_shape.size() == ndims) {
for (size_t i = 0; i < ndims; ++i) {
if (view_ggml_shape[i] != view_src_ggml_shape[i]) {
diff_dims.push_back(static_cast<int>(i));
}
}
}

if (diff_dims.size() == 1) {
int split_dim = diff_dims[0];
int64_t num_splits = static_cast<int64_t>(view_src_ggml_shape[split_dim]);
int64_t chunk_size = static_cast<int64_t>(view_ggml_shape[split_dim]);

// Only apply when slicing exactly 1 element from a multi-element dimension
if (chunk_size == 1 && num_splits > 1) {
// Check suffix strides match (dimensions after split_dim)
bool suffix_ok = view_stride_v.size() == view_src_stride_v.size();
if (suffix_ok) {
for (size_t i = static_cast<size_t>(split_dim) + 1; i < ndims; ++i) {
if (view_stride_v[i] != view_src_stride_v[i]) {
suffix_ok = false;
break;
}
}
}

if (suffix_ok && view_src_stride_v[split_dim] > 0) {
size_t relative_offset = view_offset >= view_src_offset ?
view_offset - view_src_offset : 0;
int64_t split_index = static_cast<int64_t>(
relative_offset / view_src_stride_v[split_dim]);

if (split_index >= 0 && split_index < num_splits) {
auto src_node = input.get_node_shared_ptr();
std::string rt_key = "split_dim_" + std::to_string(split_dim);
auto & rt_info = src_node->get_rt_info();

if (rt_info.find(rt_key) == rt_info.end()) {
auto axis_const = ov::op::v0::Constant::create(
ov::element::i64, {}, {static_cast<int64_t>(split_dim)});
auto split_node = std::make_shared<ov::op::v1::Split>(
input, axis_const, static_cast<size_t>(num_splits));
split_node->set_friendly_name(src_node->get_friendly_name() + "_split");
rt_info[rt_key] = split_node;
}

auto split_node = rt_info[rt_key].as<std::shared_ptr<ov::op::v1::Split>>();
return split_node->output(static_cast<size_t>(split_index));
}
}
}
}
}

// Lambda function to process a single view operation
auto process_single_view = [](ov::Output<ov::Node> current,
size_t view_offset,
Expand Down
Loading