diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 41566d41aef..e209d063b13 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -694,7 +694,9 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[8]; + char padding[16]; + // original source tensor of a copy created by the backend scheduler (ggml_backend_sched_split_graph assigns it on tensor_copy); initialized to NULL in ggml_new_tensor_impl. NOTE(review): earlier wording said this is for in-place operations, confirm that use. NOTE(review): padding grows from 8 to 16 bytes in this change and org_src is declared after it, confirm the resulting layout and GGML_TENSOR_SIZE are intended + struct ggml_tensor * org_src; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 4e36909f45e..aca69841cde 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1215,8 +1215,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra struct ggml_tensor * node = graph->nodes[i]; int * cur_backend_id = &tensor_backend_id(node); if (node->view_src != NULL && *cur_backend_id == -1) { - *cur_backend_id = tensor_backend_id(node->view_src); - SET_CAUSE(node, "4.vsrc"); + auto view_src_backend = tensor_backend_id(node->view_src); + if (view_src_backend != -1 && ggml_backend_supports_op(sched->backends[view_src_backend], node)) { + *cur_backend_id = tensor_backend_id(node->view_src); + SET_CAUSE(node, "4.vsrc"); + } } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1242,6 +1245,14 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra GGML_ASSERT(*cur_backend_id != -1); } + // debugging aid: append "#" followed by the node index to every node name via ggml_format_name. NOTE(review): this runs on every call, so the suffix would presumably accumulate if the same nodes are split again; confirm it should not be guarded by a debug switch + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + char new_name[128]; + snprintf(new_name, sizeof(new_name), "%s#%d", node->name, i); + ggml_format_name(node, "%s", new_name); + } + // pass 5: split graph, find tensors that need to be copied { int i_split = 0; @@ -1262,7 +1273,9 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra struct ggml_tensor * node = 
graph->nodes[i]; if (ggml_is_view_op(node->op)) { - continue; + if ((tensor_backend_id(node) != cur_backend_id) && (ggml_backend_supports_op(sched->backends[cur_backend_id], node))) { + tensor_backend_id(node) = cur_backend_id; + } } const int node_backend_id = tensor_backend_id(node); @@ -1360,6 +1373,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } + tensor_copy->org_src = src; tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 91e652a0405..ba987f50280 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1359,9 +1359,9 @@ void GgmlOvDecoder::compute_node_dynamic_dims() { continue; } struct ggml_tensor *root_src = nullptr; - // if (src->org_src) { - // root_src = src->org_src; - // } + if (src->org_src) { + root_src = src->org_src; + } if (root_src) { if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) || is_output_idx(root_src, node)) { @@ -1440,7 +1440,7 @@ void GgmlOvDecoder::compute_node_dynamic_dims() { // identifies the dynamic dim even when two dims share the same size. 
m_node_dynamic_dims[node] = -1; if (m_node_dynamic_dims[node->src[0]] != -1) { - if (node->src[0]->op == GGML_OP_NONE) { + if (node->src[0]->op == GGML_OP_NONE && node->src[0]->org_src == nullptr) { m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; break; } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 91850a000b5..7a30ad7afe7 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -251,7 +251,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void update_io(ggml_cgraph * cgraph); inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) { - return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE; + return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE && op->src[0]->org_src == nullptr; } inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 476c3079795..853b01c2143 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1782,6 +1782,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.name =*/ { 0 }, /*.extra =*/ NULL, /*.padding =*/ { 0 }, + /*.org_src =*/ NULL, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 16af11a2862..415a9fa2e05 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -499,12 +499,17 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg std::vector dev_configs; { std::vector devices_meta; + bool has_openvino = false; { const size_t device_count = ggml_backend_dev_count(); for (size_t i = 0; i < device_count; i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); dev_configs.emplace_back(std::vector{dev}, ggml_backend_dev_description(dev), LLAMA_SPLIT_MODE_LAYER); + if 
(strncmp(ggml_backend_dev_name(dev), "OPENVINO", 8) == 0) { + has_openvino = true; + } + // cpu-based devices cannot be used in tensor split mode if (ggml_backend_dev_buffer_type(dev) != ggml_backend_cpu_buffer_type()) { devices_meta.push_back(dev); @@ -512,7 +517,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg } } - dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR); + if (!has_openvino) { + dev_configs.emplace_back(devices_meta, "Meta", LLAMA_SPLIT_MODE_TENSOR); + } } bool all_ok = true;