ravi9 · cavusmustafa · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
@@ -1131,7 +1131,7 @@ ov::PartialShape GgmlOvDecoder::get_view_input_ov_shape(int node_idx, const std:
             if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) {
                 int dynamic_dim_index = dynamic_it->second;
                 // GGML uses reverse indexing, so convert to OpenVINO indexing
-                shape[3 - dynamic_dim_index] = -1;
+                shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1;
             }
 
             return shape;
@@ -1154,7 +1154,7 @@ ov::PartialShape GgmlOvDecoder::get_view_input_src_ov_shape(int node_idx, const
                 if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) {
                     int dynamic_dim_index = dynamic_it->second;
                     // GGML uses reverse indexing, so convert to OpenVINO indexing
-                    shape[3 - dynamic_dim_index] = -1;
+                    shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1;
                 }
 
                 return shape;

@@ -206,6 +206,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 
     virtual bool is_stateful() const override { return m_is_stateful; }
 
+    /// Token count assumed when shapes are static: m_prefill_chunk_size while
+    /// m_is_prefill is set, otherwise 1.
+    int get_static_n_tokens() const { return m_is_prefill ? m_prefill_chunk_size : 1; }
+
     virtual bool is_splited_model() const override {
         return m_model_is_splitted;
     }

@@ -125,6 +125,16 @@ class NodeContext : public frontend::NodeContext {
         if (view_input_size > 0) {
             // This is a VIEW input, get the base tensor name (last element in the chain)
             std::string base_name = m_decoder->get_view_input_src_name(m_node_idx, m_input_names[idx], view_input_size - 1);
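+            // Check if the VIEW has been resolved (translate_view produced a Slice).
+            // NOTE(review): base_it->second below is also reached when base_it == end();
+            // confirm base_name is always present in m_tensor_map or guard the lookup.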
+            auto view_it = m_tensor_map->find(m_input_names[idx]);
+            if (!base_name.empty() && view_it != m_tensor_map->end()) {
+                auto base_it = m_tensor_map->find(base_name);
+                if (base_it != m_tensor_map->end() &&
+                    view_it->second.get_node_shared_ptr() != base_it->second.get_node_shared_ptr()) {
+                    return view_it->second;
+                }
+                return base_it->second;
+            }
             if (!base_name.empty()) {
                 return m_tensor_map->at(base_name);
             }

@@ -4,6 +4,7 @@
 
 #include <memory>
 #include <openvino/core/node_output.hpp>
+#include <openvino/op/clamp.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/gelu.hpp>
 #include <openvino/op/multiply.hpp>
@@ -49,6 +50,16 @@ OutputVector translate_glu_geglu(const NodeContext & context) {
         std::swap(src0, src1);
     }
 
+    if (context.is_static()) {
+        // TODO: Temporary workaround for an NPU accuracy issue caused by fp16 overflow.
+        // To be removed once a permanent solution is implemented.
+        // Justification:
+        // For |x| > 5, GELU(x) ≈ max(x, 0)  (behaves like ReLU).
+        // So Clamp(-10, 10) only touches that ReLU-like region: inputs below -10 still yield ≈ 0.
+        // The only loss: inputs above 10 yield ≈ 10 instead of x.
+        // In practice, FFN intermediates rarely exceed 10 after GEGLU gating.
+        src0 = std::make_shared<ov::op::v0::Clamp>(src0, -10.0, 10.0);
+    }
     auto gelu = std::make_shared<ov::op::v7::Gelu>(src0);
     auto res = std::make_shared<ov::op::v1::Multiply>(gelu, src1);
 

@@ -1,6 +1,8 @@
 #include "../op_table.h"
 #include "../utils.h"
+#include <openvino/op/constant.hpp>
 #include <openvino/op/reshape.hpp>
+#include <openvino/op/slice.hpp>
 #include <set>
 namespace ov {
 namespace frontend {
@@ -9,7 +11,98 @@ namespace op {
 
 OutputVector translate_view(const NodeContext & context) {
     num_inputs_check(context, 1, 1);
-    return {context.get_input(0)};
+
+    // Translate a VIEW node. Outside static mode the view is a pass-through of
+    // its input. In static mode, a view that keeps the source strides and
+    // shrinks exactly one dimension is lowered to an explicit Slice along that
+    // dimension; every other case falls back to returning the input unchanged.
+    if (!context.is_static()) {
+        return {context.get_input(0)};
+    }
+
+    auto input = context.get_input(0);
+    auto src_shape = context.get_input_shape(0);
+    auto dst_shape = context.get_output_shape();
+
+    // Both shapes must be fully static (rank and every dimension) to compare
+    // element counts and per-dimension extents below.
+    if (src_shape.is_dynamic() || dst_shape.is_dynamic()) {
+        return {input};
+    }
+
+    const auto src_ov_shape = src_shape.to_shape();
+    const auto dst_ov_shape = dst_shape.to_shape();
+
+    size_t src_elems = 1;
+    for (const auto dim : src_ov_shape) {
+        src_elems *= dim;
+    }
+    size_t dst_elems = 1;
+    for (const auto dim : dst_ov_shape) {
+        dst_elems *= dim;
+    }
+
+    // Only a view exposing strictly fewer elements than its source is sliced.
+    if (dst_elems >= src_elems) {
+        return {input};
+    }
+
+    const auto src_stride = context.get_input_stride(0);
+    const auto dst_stride = context.get_output_stride();
+    const size_t view_offset = context.get_output_op_offset();
+
+    // Source and view must share an identical stride layout. An empty stride
+    // vector is rejected here because its last entry is read further down.
+    if (src_stride.empty() || src_stride.size() != dst_stride.size()) {
+        return {input};
+    }
+    for (size_t i = 0; i < src_stride.size(); ++i) {
+        if (src_stride[i] != dst_stride[i]) {
+            return {input};
+        }
+    }
+
+    const size_t ndims = src_ov_shape.size();
+    if (dst_ov_shape.size() != ndims) {
+        return {input};
+    }
+
+    // Locate the single dimension whose extent differs; zero or several
+    // differing dimensions are not handled as a slice.
+    size_t slice_dim = ndims;
+    size_t num_diff = 0;
+    for (size_t i = 0; i < ndims; ++i) {
+        if (src_ov_shape[i] != dst_ov_shape[i]) {
+            slice_dim = i;
+            ++num_diff;
+        }
+    }
+    if (num_diff != 1) {
+        return {input};
+    }
+
+    // Number of elements covered by one step along slice_dim, computed as the
+    // product of the source extents that follow it.
+    size_t ov_stride_for_dim = 1;
+    for (size_t i = slice_dim + 1; i < ndims; ++i) {
+        ov_stride_for_dim *= src_ov_shape[i];
+    }
+
+    // The last stride entry is used as the per-element size; zero is treated as 1.
+    const size_t elem_size = src_stride.back() == 0 ? 1 : src_stride.back();
+
+    // The offset has to fall exactly on a slice_dim step. Without this check the
+    // truncating divisions below would silently pick a different start index.
+    if (ov_stride_for_dim == 0 || view_offset % elem_size != 0 ||
+        (view_offset / elem_size) % ov_stride_for_dim != 0) {
+        return {input};
+    }
+
+    const int64_t begin_val = static_cast<int64_t>((view_offset / elem_size) / ov_stride_for_dim);
+    const int64_t end_val = begin_val + static_cast<int64_t>(dst_ov_shape[slice_dim]);
+
+    // The requested range must lie inside the source dimension.
+    if (begin_val < 0 || end_val > static_cast<int64_t>(src_ov_shape[slice_dim])) {
+        return {input};
+    }
+
+    auto sliced = std::make_shared<ov::op::v8::Slice>(
+        input,
+        ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}),
+        ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}),
+        ov::op::v0::Constant::create(ov::element::i64, {1}, {1}),
+        ov::op::v0::Constant::create(ov::element::i64, {1}, {static_cast<int64_t>(slice_dim)}));
+
+    sliced->set_friendly_name(context.get_output_name());
+    return {sliced->output(0)};
 }
 
 }  // namespace op

@@ -17,6 +17,7 @@
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/shape_of.hpp>
 #include <openvino/op/sin.hpp>
+#include <openvino/op/split.hpp>
 #include <openvino/op/squeeze.hpp>
 #include <openvino/op/subtract.hpp>
 #include <openvino/op/transpose.hpp>
@@ -262,6 +263,93 @@ ov::Output<ov::Node> process_view_input_new(const NodeContext & context, int inp
         return input;
     }
 
+    // If translate_view already resolved this VIEW (produced a Slice), the input
+    // will already have the expected shape — skip re-slicing.
+    auto expected_ov_shape = context.get_view_input_ov_shape(input_index, 0);
+    auto actual_shape = input.get_partial_shape();
+    if (expected_ov_shape.rank().is_static() && actual_shape.rank().is_static() &&
+        expected_ov_shape.rank() == actual_shape.rank()) {
+        bool shapes_match = true;
+        for (int64_t i = 0; i < expected_ov_shape.rank().get_length(); ++i) {
+            if (expected_ov_shape[i].is_static() && actual_shape[i].is_static() &&
+                expected_ov_shape[i] != actual_shape[i]) {
+                shapes_match = false;
+                break;
+            }
+        }
+        if (shapes_match) {
+            return input;
+        }
+    }
+
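+    // In static mode, use Split instead of Slice when a view selects exactly one
+    // element along a single dimension.
+    // This ensures NPUW's FOLD doesn't parametrize per-layer slice indices (which
+    // would introduce dynamic shapes). A shared Split node sits outside the repeated
+    // subgraph boundary; each layer receives one of its output ports.
+    // NOTE(review): the shared Split consumes `input` and is cached in the rt_info of
+    // input's own node — confirm this does not form an ownership cycle.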
+    if (context.is_static() && view_input_size == 1) {
+        auto view_stride_v = context.get_view_input_stride(input_index, 0);
+        auto view_src_stride_v = context.get_view_input_src_stride(input_index, 0);
+        auto view_ggml_shape = context.get_view_input_ggml_shape(input_index, 0);
+        auto view_src_ggml_shape = context.get_view_input_src_ggml_shape(input_index, 0);
+        auto view_offset = context.get_view_input_offset(input_index, 0);
+        auto view_src_offset = context.get_view_input_src_offset(input_index, 0);
+
+        size_t ndims = view_ggml_shape.size();
+        std::vector<int> diff_dims;
+        if (view_src_ggml_shape.size() == ndims) {
+            for (size_t i = 0; i < ndims; ++i) {
+                if (view_ggml_shape[i] != view_src_ggml_shape[i]) {
+                    diff_dims.push_back(static_cast<int>(i));
+                }
+            }
+        }
+
+        if (diff_dims.size() == 1) {
+            int split_dim = diff_dims[0];
+            int64_t num_splits = static_cast<int64_t>(view_src_ggml_shape[split_dim]);
+            int64_t chunk_size = static_cast<int64_t>(view_ggml_shape[split_dim]);
+
+            // Only apply when slicing exactly 1 element from a multi-element dimension
+            if (chunk_size == 1 && num_splits > 1) {
+                // Check suffix strides match (dimensions after split_dim)
+                bool suffix_ok = view_stride_v.size() == view_src_stride_v.size();
+                if (suffix_ok) {
+                    for (size_t i = static_cast<size_t>(split_dim) + 1; i < ndims; ++i) {
+                        if (view_stride_v[i] != view_src_stride_v[i]) {
+                            suffix_ok = false;
+                            break;
+                        }
+                    }
+                }
+
+                if (suffix_ok && view_src_stride_v[split_dim] > 0) {
+                    size_t relative_offset = view_offset >= view_src_offset ?
+                        view_offset - view_src_offset : 0;
+                    int64_t split_index = static_cast<int64_t>(
+                        relative_offset / view_src_stride_v[split_dim]);
+
+                    if (split_index >= 0 && split_index < num_splits) {
+                        auto src_node = input.get_node_shared_ptr();
+                        std::string rt_key = "split_dim_" + std::to_string(split_dim);
+                        auto & rt_info = src_node->get_rt_info();
+
+                        if (rt_info.find(rt_key) == rt_info.end()) {
+                            auto axis_const = ov::op::v0::Constant::create(
+                                ov::element::i64, {}, {static_cast<int64_t>(split_dim)});
+                            auto split_node = std::make_shared<ov::op::v1::Split>(
+                                input, axis_const, static_cast<size_t>(num_splits));
+                            split_node->set_friendly_name(src_node->get_friendly_name() + "_split");
+                            rt_info[rt_key] = split_node;
+                        }
+
+                        auto split_node = rt_info[rt_key].as<std::shared_ptr<ov::op::v1::Split>>();
+                        return split_node->output(static_cast<size_t>(split_index));
+                    }
+                }
+            }
+        }
+    }
+
     // Lambda function to process a single view operation
     auto process_single_view = [](ov::Output<ov::Node> current,
                                   size_t view_offset,