From 1e112ac81c953371ed8e8c686c077c0a5aa2a87f Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 25 Nov 2020 05:08:23 +0000 Subject: [PATCH 01/32] refactor and add some fp16 --- .../core/operators/generic_op/generic_op.hpp | 17 ++- .../frontend/tensorflow_import/ops/const.cpp | 128 +++++++----------- .../frontend/tensorflow_import/ops/const.hpp | 2 +- .../tensorflow_import/util/graph_convert.cpp | 2 +- .../frontend/tensorflow_import/util/util.cpp | 1 + 5 files changed, 66 insertions(+), 84 deletions(-) diff --git a/src/nnfusion/core/operators/generic_op/generic_op.hpp b/src/nnfusion/core/operators/generic_op/generic_op.hpp index fa500a93a..831457e32 100644 --- a/src/nnfusion/core/operators/generic_op/generic_op.hpp +++ b/src/nnfusion/core/operators/generic_op/generic_op.hpp @@ -5,7 +5,9 @@ #include #include + #include "nnfusion/common/common.hpp" +#include "ngraph/src/nnfusion/common/type/element_type.hpp" #define REGISTER_OP(op_x) \ static nnfusion::op::OpConfig __register_op_##op_x = nnfusion::op::build_op_config(#op_x) @@ -200,23 +202,26 @@ namespace nnfusion { alias_name = alias_name.empty() ? 
input_name : alias_name; config[alias_name] = input_name; - auto d_type = tensor->get_element_type().c_type_string(); - if (d_type == "float") + auto d_type = tensor->get_element_type(); + if (d_type == element::f32) { config[alias_name + "_dtype"] = "float32"; } - else if (d_type == "int32_t") + else if (d_type == element::i32) { config[alias_name + "_dtype"] = "int32"; } - else if (d_type == "int64_t") + else if (d_type == element::i64) { config[alias_name + "_dtype"] = "int64"; } + else if (d_type == element::f16) + { + config[alias_name + "_dtype"] = "float16"; + } else { - printf("Unhandled type: %s\n", d_type.c_str()); - assert(0); + NNFUSION_CHECK_FAIL_WITH_EXCEPTION() << "Unhandled type: " << d_type.c_str(); } auto shape = tensor->get_shape(); if (shape.size() == 0) diff --git a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp index b936c0b10..9dba39c21 100644 --- a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp +++ b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp @@ -167,72 +167,46 @@ namespace nnfusion // int_val, float_val, etc. 
if (tensor_content_size == 0) { + +#define GET_VALUES(type) do { \ + const void* dat = nullptr; \ + for (size_t i = 0; i < n_elements; ++i) { \ + if (tensor.type##_val_size() == 1) { \ + dat = reinterpret_cast(&tensor.type##_val()[0]); \ + } else { \ + dat = reinterpret_cast(&tensor.type##_val()[i]); \ + } \ + values->setElement(i, dat); \ + } \ + } while(0) + values->resize(n_elements); - for (size_t i = 0; i < n_elements; i++) - { - auto& tensor = node.attr().at("value").tensor(); - const void* dat = nullptr; - switch (dt) + auto& tensor = node.attr().at("value").tensor(); + size_t val_size; + if (dt == tensorflow::DT_INT32) { + GET_VALUES(int); + } else if (dt == tensorflow::DT_INT64) { + GET_VALUES(int64); + } else if (dt == tensorflow::DT_BOOL) { + GET_VALUES(bool); + } else if (dt == tensorflow::DT_HALF) { + GET_VALUES(half); + } else if (dt == tensorflow::DT_FLOAT) { + GET_VALUES(float); + } else if (dt == tensorflow::DT_DOUBLE) { + GET_VALUES(double); + } else if (dt == tensorflow::DT_STRING) { + values->resize(tensor.string_val()[0].length()); + auto it = tensor.string_val()[0].begin(); + for (size_t j = 0; it != tensor.string_val()[0].end(); ++j, ++it) { - // TODO(amprocte/NGRAPH-2502): there are more element types to support - // here - case tensorflow::DT_INT32: - dat = reinterpret_cast(&(tensor.int_val_size() == 1 - ? tensor.int_val()[0] - : tensor.int_val()[i])); - values->setElement(i, dat); - break; - case tensorflow::DT_INT64: - dat = reinterpret_cast(&(tensor.int64_val_size() == 1 - ? tensor.int64_val()[0] - : tensor.int64_val()[i])); - values->setElement(i, dat); - break; - case tensorflow::DT_FLOAT: - dat = reinterpret_cast(&(tensor.float_val_size() == 1 - ? tensor.float_val()[0] - : tensor.float_val()[i])); - values->setElement(i, dat); - break; - case tensorflow::DT_BOOL: - dat = reinterpret_cast(&(tensor.bool_val_size() == 1 - ? 
tensor.bool_val()[0] - : tensor.bool_val()[i])); - values->setElement(i, dat); - break; - case tensorflow::DT_DOUBLE: - dat = reinterpret_cast(&(tensor.double_val_size() == 1 - ? tensor.double_val()[0] - : tensor.double_val()[i])); - values->setElement(i, dat); - break; - case tensorflow::DT_STRING: - if (i > 0) - { - // TODO: only support one dimension for string type now - return false; - } - { - values->resize(tensor.string_val()[0].length()); - auto it = tensor.string_val()[0].begin(); - for (size_t j = 0; it != tensor.string_val()[0].end(); ++j, ++it) - { - values->setElement(j, reinterpret_cast(&it)); - } - } - break; - default: - return false; - // NGRAPH_VLOG(0) - // << "Const node has empty tensor and we don't know how to " - // "handle this element type"; - // NGRAPH_VLOG(0) << node.DebugString(); - // NGRAPH_VLOG(0) << shape.DebugString(); - // return errors::Unimplemented("Encountered unknown element type ", - // DataType_Name(dt), - // " on an empty tensor"); + values->setElement(j, reinterpret_cast(&it)); } + } else { + return false; } + +#undef GET_VALUES } else { @@ -372,7 +346,9 @@ namespace nnfusion try { - const auto& type = TF_NGRAPH_CONST_MAP.at(dtype); + element::Type type; + result = TFDataTypeToNNFusionElementType(dtype, &type); + NNFUSION_CHECK(result); result = MakeConstOp(node, type, &ng_node); NNFUSION_CHECK(result); } @@ -388,19 +364,19 @@ namespace nnfusion return ret; } - const std::map TF_NGRAPH_CONST_MAP = { - {tensorflow::DataType::DT_FLOAT, nnfusion::element::f32}, - {tensorflow::DataType::DT_DOUBLE, nnfusion::element::f64}, - {tensorflow::DataType::DT_INT8, nnfusion::element::i8}, - {tensorflow::DataType::DT_INT16, nnfusion::element::i16}, - {tensorflow::DataType::DT_INT32, nnfusion::element::i32}, - {tensorflow::DataType::DT_INT64, nnfusion::element::i64}, - {tensorflow::DataType::DT_UINT8, nnfusion::element::u8}, - {tensorflow::DataType::DT_UINT16, nnfusion::element::u16}, - {tensorflow::DataType::DT_UINT32, 
nnfusion::element::u32}, - {tensorflow::DataType::DT_UINT64, nnfusion::element::u64}, - {tensorflow::DataType::DT_BOOL, nnfusion::element::boolean}, - {tensorflow::DataType::DT_STRING, nnfusion::element::character}}; + // const std::map TF_NGRAPH_CONST_MAP = { + // {tensorflow::DataType::DT_FLOAT, nnfusion::element::f32}, + // {tensorflow::DataType::DT_DOUBLE, nnfusion::element::f64}, + // {tensorflow::DataType::DT_INT8, nnfusion::element::i8}, + // {tensorflow::DataType::DT_INT16, nnfusion::element::i16}, + // {tensorflow::DataType::DT_INT32, nnfusion::element::i32}, + // {tensorflow::DataType::DT_INT64, nnfusion::element::i64}, + // {tensorflow::DataType::DT_UINT8, nnfusion::element::u8}, + // {tensorflow::DataType::DT_UINT16, nnfusion::element::u16}, + // {tensorflow::DataType::DT_UINT32, nnfusion::element::u32}, + // {tensorflow::DataType::DT_UINT64, nnfusion::element::u64}, + // {tensorflow::DataType::DT_BOOL, nnfusion::element::boolean}, + // {tensorflow::DataType::DT_STRING, nnfusion::element::character}}; } // namespace tensorflow_import } // namespace frontend } // namespace nnfusion diff --git a/src/nnfusion/frontend/tensorflow_import/ops/const.hpp b/src/nnfusion/frontend/tensorflow_import/ops/const.hpp index 6b86a225d..827dc76be 100644 --- a/src/nnfusion/frontend/tensorflow_import/ops/const.hpp +++ b/src/nnfusion/frontend/tensorflow_import/ops/const.hpp @@ -19,7 +19,7 @@ namespace nnfusion const NodeMap& all_ng_nodes, std::shared_ptr m_ngraph); - extern const std::map TF_NGRAPH_CONST_MAP; + // extern const std::map TF_NGRAPH_CONST_MAP; } // namespace tensorflow_import } // namespace frontend } // namespace nnfusion diff --git a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp index 12b325b5e..66ba79696 100644 --- a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp +++ b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp @@ -165,7 +165,7 @@ namespace 
nnfusion NNFUSION_CHECK(status); nnfusion::element::Type nnfusion_et; status = TFDataTypeToNNFusionElementType(dtype, &nnfusion_et); - NNFUSION_CHECK(status); + NNFUSION_CHECK(status) << "DataType " << dtype << " is not supported."; tensorflow::TensorShapeProto tf_shape = node.attr().at("shape").shape(); nnfusion::Shape ng_shape; status = TFTensorShapeToNGraphShape(tf_shape, &ng_shape); diff --git a/src/nnfusion/frontend/tensorflow_import/util/util.cpp b/src/nnfusion/frontend/tensorflow_import/util/util.cpp index e060cf97b..9531b06ea 100644 --- a/src/nnfusion/frontend/tensorflow_import/util/util.cpp +++ b/src/nnfusion/frontend/tensorflow_import/util/util.cpp @@ -14,6 +14,7 @@ namespace nnfusion { switch (tf_dt) { + case tensorflow::DataType::DT_HALF: *ng_et = nnfusion::element::f16; break; case tensorflow::DataType::DT_FLOAT: *ng_et = nnfusion::element::f32; break; case tensorflow::DataType::DT_DOUBLE: *ng_et = nnfusion::element::f64; break; case tensorflow::DataType::DT_INT8: *ng_et = nnfusion::element::i8; break; From de54196168067d366950a95c052d923fe4d2c837 Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 25 Nov 2020 05:29:20 +0000 Subject: [PATCH 02/32] fix compiling error --- src/nnfusion/core/operators/generic_op/generic_op.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnfusion/core/operators/generic_op/generic_op.hpp b/src/nnfusion/core/operators/generic_op/generic_op.hpp index 831457e32..2a9f36171 100644 --- a/src/nnfusion/core/operators/generic_op/generic_op.hpp +++ b/src/nnfusion/core/operators/generic_op/generic_op.hpp @@ -221,7 +221,7 @@ namespace nnfusion } else { - NNFUSION_CHECK_FAIL_WITH_EXCEPTION() << "Unhandled type: " << d_type.c_str(); + NNFUSION_CHECK_FAIL() << "Unhandled type: " << d_type; } auto shape = tensor->get_shape(); if (shape.size() == 0) From 272d39484cfa08a9f7716dd0cdf08783a35bcfab Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 25 Nov 2020 12:42:56 +0000 Subject: [PATCH 03/32] changes for fp16 --- 
.../core/kernels/cpu/eigen/concat.cpp | 2 +- .../core/kernels/cpu/eigen/convolution.cpp | 2 +- src/nnfusion/core/kernels/cpu/eigen/dot.cpp | 2 +- .../core/kernels/cpu/eigen/elementwise.cpp | 2 +- src/nnfusion/core/kernels/cpu/eigen/lstm.cpp | 2 +- .../core/kernels/cpu/eigen/max_pool.cpp | 2 +- src/nnfusion/core/kernels/cpu/eigen/pad.cpp | 2 +- .../core/kernels/cpu/eigen/reduce.cpp | 2 +- .../core/kernels/cpu/eigen/softmax.cpp | 2 +- .../core/kernels/cpu/general/anyop.cpp | 2 +- .../core/kernels/cpu/general/reshape.cpp | 2 +- src/nnfusion/core/kernels/cpu/mkl/dot.cpp | 2 +- .../core/kernels/cpu/mlas/avg_pool.cpp | 2 +- .../core/kernels/cpu/mlas/batch_matmul.cpp | 2 +- .../core/kernels/cpu/mlas/convolution.cpp | 2 +- src/nnfusion/core/kernels/cpu/mlas/dot.cpp | 2 +- .../core/kernels/cpu/mlas/max_pool.cpp | 2 +- .../kernels/cpu/reference/batch_matmul.cpp | 2 +- .../core/kernels/cpu/reference/constant.cpp | 2 +- .../core/kernels/cpu/reference/kernels.cpp | 104 +++++++++--------- .../core/kernels/cpu/reference/one_hot.cpp | 2 +- .../core/kernels/cpu/reference/reduce_all.cpp | 2 +- .../kernels/cpu/reference/stop_gradient.cpp | 2 +- .../core/kernels/cpu/reference/transpose.cpp | 2 +- .../core/kernels/cpu/reference/variable.cpp | 2 +- .../core/kernels/cpu/simd/elementwise.cpp | 2 +- .../kernels/cpu/simd/elementwise_fused.cpp | 2 +- .../core/kernels/cuda_gpu/cuda_cudnn.cpp | 14 ++- .../core/kernels/cuda_gpu/cuda_cudnn.hpp | 8 +- .../core/kernels/cuda_gpu/cuda_langunit.cpp | 10 ++ .../core/kernels/cuda_gpu/cuda_langunit.hpp | 1 + .../cuda_gpu/inl/generate_kernel_code-inl.hpp | 2 +- .../cuda_gpu/kernels/adam_optimizer.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/addn.cpp | 2 +- .../kernels/cuda_gpu/kernels/allreduce.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/anyop.cpp | 2 +- .../kernels/cuda_gpu/kernels/apply_adam.cpp | 2 +- .../kernels/apply_gradient_descent.cpp | 4 +- .../cuda_gpu/kernels/apply_momentum.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/assign.cpp | 2 +- 
.../kernels/cuda_gpu/kernels/assign_sub.cpp | 2 +- .../kernels/cuda_gpu/kernels/avg_pool.cpp | 4 +- .../kernels/cuda_gpu/kernels/batch_matmul.cpp | 4 +- .../kernels/cuda_gpu/kernels/batch_norm.cpp | 6 +- .../cuda_gpu/kernels/blockfusion_fused.cpp | 2 +- .../kernels/cuda_gpu/kernels/broadcast.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/concat.cpp | 4 +- .../cuda_gpu/kernels/concat_offset.cpp | 2 +- .../kernels/cuda_gpu/kernels/constant.cpp | 2 +- .../kernels/cuda_gpu/kernels/convolution.cpp | 22 +++- .../kernels/cuda_gpu/kernels/convolution.hpp | 1 + .../crossentropy_fwdbwd_softmax_bwd_large.cpp | 2 +- .../cuda_gpu/kernels/depthwise_conv2d.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/dot.cpp | 91 ++++++++++++++- .../core/kernels/cuda_gpu/kernels/dropout.cpp | 4 +- .../cuda_gpu/kernels/dynamic_stitch.cpp | 2 +- .../kernels/cuda_gpu/kernels/elementwise.cpp | 2 +- .../cuda_gpu/kernels/elementwise_fused.cpp | 2 +- .../kernels/cuda_gpu/kernels/gather_1d.cpp | 4 +- .../kernels/cuda_gpu/kernels/gather_nd.cpp | 4 +- .../cuda_gpu/kernels/invert_permutation.cpp | 2 +- .../kernels/cuda_gpu/kernels/layer_norm.cpp | 2 +- .../kernels/cuda_gpu/kernels/max_pool.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/one_hot.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/pad.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/range.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/reduce.cpp | 24 ++-- .../kernels/cuda_gpu/kernels/reduce_all.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/reshape.cpp | 8 +- .../core/kernels/cuda_gpu/kernels/result.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/reverse.cpp | 2 +- .../cuda_gpu/kernels/reverse_sequence.cpp | 4 +- .../kernels/rocm/batch_gemm_fixed.cpp | 2 +- .../cuda_gpu/kernels/rocm/broadcast_host.cpp | 2 +- .../kernels/rocm/broadcast_kernel.cpp | 2 +- .../cuda_gpu/kernels/rocm/convfwd_fixed.cpp | 2 +- .../cuda_gpu/kernels/rocm/convolution.cpp | 2 +- .../cuda_gpu/kernels/rocm/gemm_fixed.cpp | 2 +- .../cuda_gpu/kernels/rocm/reduce_sum.cpp | 2 +- 
.../kernels/cuda_gpu/kernels/rocm/softmax.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/scatter.cpp | 2 +- .../kernels/cuda_gpu/kernels/select_node.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/slice.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/softmax.cpp | 4 +- .../kernels/sparse_apply_momentum.cpp | 2 +- .../cuda_gpu/kernels/stop_gradient.cpp | 2 +- .../cuda_gpu/kernels/strided_slice_grad.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/tile.cpp | 4 +- .../kernels/cuda_gpu/kernels/transpose.cpp | 2 +- .../cuda_gpu/kernels/unsorted_segment_sum.cpp | 2 +- .../kernels/cuda_gpu/kernels/variable.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/zeros.cpp | 2 +- src/nnfusion/core/kernels/hlsl/constant.cpp | 2 +- src/nnfusion/core/kernels/hlsl/parameter.cpp | 2 +- src/nnfusion/core/kernels/hlsl/result.cpp | 2 +- .../core/kernels/kernel_registration.cpp | 7 +- .../core/kernels/kernel_registration.hpp | 9 +- .../engine/pass/codegen/cuda_codegen_pass.cpp | 1 + .../engine/pass/graph/kernel_selection.cpp | 6 +- .../frontend/tensorflow_import/ops/const.cpp | 64 ----------- test/main.cpp | 2 +- 101 files changed, 313 insertions(+), 259 deletions(-) diff --git a/src/nnfusion/core/kernels/cpu/eigen/concat.cpp b/src/nnfusion/core/kernels/cpu/eigen/concat.cpp index 89a517269..f7e3df997 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/concat.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/concat.cpp @@ -210,5 +210,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Concat", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::ConcatEigen) diff --git a/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp b/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp index 67cbe1dd6..8bda08757 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp @@ -144,5 +144,5 @@ using namespace 
nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Convolution", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::ConvolutionEigen) diff --git a/src/nnfusion/core/kernels/cpu/eigen/dot.cpp b/src/nnfusion/core/kernels/cpu/eigen/dot.cpp index e3523225d..4a04d623b 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/dot.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/dot.cpp @@ -90,5 +90,5 @@ LanguageUnit_p cpu::Dot::emit_dependency() REGISTER_KERNEL_EMITTER( "Dot", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::Dot) diff --git a/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp b/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp index 94db71307..bfc79215f 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp @@ -8,7 +8,7 @@ using namespace nnfusion::kernels; #define REGISTER_EW_KERNEL(OP_NAME) \ REGISTER_KERNEL_EMITTER("" #OP_NAME "", \ - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), \ + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \ cpu::ElementwiseEigen); REGISTER_EW_KERNEL(Abs) diff --git a/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp b/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp index e29e8c456..8c3ed6046 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp @@ -145,7 +145,7 @@ LanguageUnit_p cpu::LstmEigen::emit_dependency() REGISTER_KERNEL_EMITTER( "Lstm", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::LstmEigen) void cpu::LstmEigen::emit_compute_input_helper(nnfusion::codegen::CodeWriter& 
lu) diff --git a/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp b/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp index e8ed45698..cc55948d7 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp @@ -169,5 +169,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "MaxPool", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::MaxPoolEigen) diff --git a/src/nnfusion/core/kernels/cpu/eigen/pad.cpp b/src/nnfusion/core/kernels/cpu/eigen/pad.cpp index 9e05125d4..aa82543a8 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/pad.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/pad.cpp @@ -8,5 +8,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Pad", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::Pad) // constructor diff --git a/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp b/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp index 195574a31..86065f66c 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp @@ -8,7 +8,7 @@ using namespace nnfusion::kernels; #define REGISTER_EW_KERNEL(OP_NAME) \ REGISTER_KERNEL_EMITTER("" #OP_NAME "", \ - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), \ + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \ cpu::ReduceEigen); //REGISTER_EW_KERNEL(Sum) diff --git a/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp b/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp index 45546ddda..cb7a99800 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp @@ -8,5 +8,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Softmax", // 
op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::SoftmaxEigen) // constructor diff --git a/src/nnfusion/core/kernels/cpu/general/anyop.cpp b/src/nnfusion/core/kernels/cpu/general/anyop.cpp index 9ec7d6f06..de5620391 100644 --- a/src/nnfusion/core/kernels/cpu/general/anyop.cpp +++ b/src/nnfusion/core/kernels/cpu/general/anyop.cpp @@ -36,5 +36,5 @@ LanguageUnit_p cpu::AnyOP::emit_dependency() // Register Pad kernel emitter REGISTER_KERNEL_EMITTER("AnyOP", //op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Priority(2), //attrs cpu::AnyOP) // constructor diff --git a/src/nnfusion/core/kernels/cpu/general/reshape.cpp b/src/nnfusion/core/kernels/cpu/general/reshape.cpp index a0774c5ca..f8e694d00 100644 --- a/src/nnfusion/core/kernels/cpu/general/reshape.cpp +++ b/src/nnfusion/core/kernels/cpu/general/reshape.cpp @@ -96,5 +96,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Reshape", //op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("cpu").Priority(2), //attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("cpu").Priority(2), //attrs cpu::ReshapeMemcpy) //constructor diff --git a/src/nnfusion/core/kernels/cpu/mkl/dot.cpp b/src/nnfusion/core/kernels/cpu/mkl/dot.cpp index cb414b767..eb4e77fa4 100644 --- a/src/nnfusion/core/kernels/cpu/mkl/dot.cpp +++ b/src/nnfusion/core/kernels/cpu/mkl/dot.cpp @@ -178,5 +178,5 @@ LanguageUnit_p cpu::DotMkl::emit_dependency() REGISTER_KERNEL_EMITTER( "Dot", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mkl").Priority(3), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mkl").Priority(3), // attrs cpu::DotMkl) diff --git a/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp b/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp index 
ae8ca6a91..68f8c484e 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp @@ -109,5 +109,5 @@ LanguageUnit_p cpu::AvgPoolMlas::emit_dependency() REGISTER_KERNEL_EMITTER( "AvgPool", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs cpu::AvgPoolMlas) // constructor diff --git a/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp b/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp index 4e25e1bac..bcd21a959 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp @@ -115,5 +115,5 @@ LanguageUnit_p cpu::BatchMatMulMlas::emit_dependency() REGISTER_KERNEL_EMITTER( "BatchMatMul", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs cpu::BatchMatMulMlas) diff --git a/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp b/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp index ed5a9f594..48f635f5a 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp @@ -185,5 +185,5 @@ LanguageUnit_p cpu::ConvolutionMlas::emit_dependency() REGISTER_KERNEL_EMITTER( "Convolution", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs cpu::ConvolutionMlas) // constructor diff --git a/src/nnfusion/core/kernels/cpu/mlas/dot.cpp b/src/nnfusion/core/kernels/cpu/mlas/dot.cpp index 6f5fc3119..37ae88445 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/dot.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/dot.cpp @@ -112,5 +112,5 @@ LanguageUnit_p cpu::DotMlas::emit_dependency() REGISTER_KERNEL_EMITTER( "Dot", // op_name - 
Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs cpu::DotMlas) diff --git a/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp b/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp index 419b8dfba..85e2bc94b 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp @@ -111,5 +111,5 @@ LanguageUnit_p cpu::MaxPoolMlas::emit_dependency() REGISTER_KERNEL_EMITTER( "MaxPool", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs cpu::MaxPoolMlas) // constructor diff --git a/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp b/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp index f8b9a0e99..bbff5a1bd 100644 --- a/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp @@ -114,7 +114,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "BatchMatMul", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs BatchMatMulRef) // constructor } // namespace cpu diff --git a/src/nnfusion/core/kernels/cpu/reference/constant.cpp b/src/nnfusion/core/kernels/cpu/reference/constant.cpp index e6ae61aa2..e7094a52f 100644 --- a/src/nnfusion/core/kernels/cpu/reference/constant.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/constant.cpp @@ -70,5 +70,5 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Constant", //op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT), //attrs + Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs cpu::Constant) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cpu/reference/kernels.cpp 
b/src/nnfusion/core/kernels/cpu/reference/kernels.cpp index 13755c86f..896fa385e 100644 --- a/src/nnfusion/core/kernels/cpu/reference/kernels.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/kernels.cpp @@ -2137,7 +2137,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Abs", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AbsRef) // constructor class AcosRef : public KernelEmitter @@ -2175,7 +2175,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Acos", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AcosRef) // constructor class AddRef : public KernelEmitter @@ -2213,7 +2213,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Add", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AddRef) // constructor class AllReduceRef : public KernelEmitter @@ -2252,7 +2252,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "AllReduce", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AllReduceRef) // constructor class AsinRef : public KernelEmitter @@ -2290,7 +2290,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Asin", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AsinRef) // constructor class AtanRef : public KernelEmitter @@ -2328,7 +2328,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Atan", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AtanRef) // constructor class BroadcastRef : public 
KernelEmitter @@ -2368,7 +2368,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Broadcast", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs BroadcastRef) // constructor class CeilingRef : public KernelEmitter @@ -2406,7 +2406,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Ceiling", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs CeilingRef) // constructor class ConcatRef : public KernelEmitter @@ -2453,7 +2453,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Concat", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ConcatRef) // constructor /* @@ -2489,7 +2489,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Constant", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ConstantRef) // constructor */ @@ -2530,7 +2530,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Convert", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ConvertRef) // constructor class ConvolutionRef : public KernelEmitter @@ -2575,7 +2575,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Convolution", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ConvolutionRef) // constructor class CosRef : public KernelEmitter @@ -2613,7 +2613,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Cos", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + 
Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs CosRef) // constructor class CoshRef : public KernelEmitter @@ -2651,7 +2651,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Cosh", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs CoshRef) // constructor class DivideRef : public KernelEmitter @@ -2689,7 +2689,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Divide", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs DivideRef) // constructor class EqualRef : public KernelEmitter @@ -2727,7 +2727,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Equal", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs EqualRef) // constructor class ExpRef : public KernelEmitter @@ -2765,7 +2765,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Exp", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ExpRef) // constructor class FloorRef : public KernelEmitter @@ -2803,7 +2803,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Floor", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs FloorRef) // constructor class GreaterRef : public KernelEmitter @@ -2841,7 +2841,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Greater", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs GreaterRef) // constructor class LessRef : public KernelEmitter @@ -2879,7 +2879,7 @@ namespace nnfusion 
REGISTER_KERNEL_EMITTER( "Less", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs LessRef) // constructor class LogRef : public KernelEmitter @@ -2917,7 +2917,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Log", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs LogRef) // constructor class LRNRef : public KernelEmitter @@ -2957,7 +2957,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "LRN", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs LRNRef) // constructor class MaxRef : public KernelEmitter @@ -2997,7 +2997,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Max", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs MaxRef) // constructor class MaximumRef : public KernelEmitter @@ -3035,7 +3035,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Maximum", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs MaximumRef) // constructor class MinRef : public KernelEmitter @@ -3075,7 +3075,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Min", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs MinRef) // constructor class MinimumRef : public KernelEmitter @@ -3113,7 +3113,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Minimum", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs MinimumRef) 
// constructor class MultiplyRef : public KernelEmitter @@ -3151,7 +3151,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Multiply", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs MultiplyRef) // constructor class NegativeRef : public KernelEmitter @@ -3189,7 +3189,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Negative", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs NegativeRef) // constructor class PowerRef : public KernelEmitter @@ -3227,7 +3227,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Power", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs PowerRef) // constructor class ProductRef : public KernelEmitter @@ -3267,7 +3267,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Product", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ProductRef) // constructor class ReluRef : public KernelEmitter @@ -3305,7 +3305,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Relu", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ReluRef) // constructor class SelectRef : public KernelEmitter @@ -3344,7 +3344,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Select", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SelectRef) // constructor class SigmoidRef : public KernelEmitter @@ -3382,7 +3382,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Sigmoid", // op_name - 
Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SigmoidRef) // constructor class SignRef : public KernelEmitter @@ -3420,7 +3420,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Sign", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SignRef) // constructor class SinRef : public KernelEmitter @@ -3458,7 +3458,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Sin", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SinRef) // constructor class SinhRef : public KernelEmitter @@ -3496,7 +3496,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Sinh", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SinhRef) // constructor class SliceRef : public KernelEmitter @@ -3537,7 +3537,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Slice", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SliceRef) // constructor class SoftmaxRef : public KernelEmitter @@ -3581,7 +3581,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Softmax", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SoftmaxRef) // constructor class SqrtRef : public KernelEmitter @@ -3619,7 +3619,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Sqrt", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SqrtRef) // constructor class SubtractRef : 
public KernelEmitter @@ -3657,7 +3657,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Subtract", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SubtractRef) // constructor class SumRef : public KernelEmitter @@ -3697,7 +3697,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Sum", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs SumRef) // constructor class TanRef : public KernelEmitter @@ -3735,7 +3735,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Tan", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs TanRef) // constructor class TanhRef : public KernelEmitter @@ -3773,7 +3773,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Tanh", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs TanhRef) // constructor class BatchNormRef : public KernelEmitter @@ -3812,7 +3812,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "BatchNormInference", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs BatchNormRef) class AvgPoolRef : public KernelEmitter @@ -3856,7 +3856,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "AvgPool", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AvgPoolRef) class DotRef : public KernelEmitter @@ -3897,7 +3897,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Dot", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + 
Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs DotRef) class MaxPoolRef : public KernelEmitter @@ -3940,7 +3940,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "MaxPool", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs MaxPoolRef) class PadRef : public KernelEmitter @@ -3982,7 +3982,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Pad", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs PadRef) class ReshapeRef : public KernelEmitter @@ -4022,7 +4022,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Reshape", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ReshapeRef) class ResultRef : public KernelEmitter @@ -4068,7 +4068,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Result", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ResultRef) class LessEqRef : public KernelEmitter @@ -4106,7 +4106,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "LessEq", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs LessEqRef) class ReverseRef : public KernelEmitter @@ -4146,7 +4146,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Reverse", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ReverseRef) } // namespace cpu diff --git a/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp b/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp index 4aeec8b82..ddaa06613 100644 --- 
a/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp @@ -69,7 +69,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "OneHot", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs OneHotRef) // constructor } // namespace cpu diff --git a/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp b/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp index 1f27075fb..20362bfad 100644 --- a/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp @@ -62,7 +62,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "All", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AllRef) // constructor } // namespace cpu diff --git a/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp b/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp index 3bb049f38..531cea738 100644 --- a/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp @@ -55,7 +55,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "StopGradient", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs StopGradientRef) // constructor } // namespace cpu diff --git a/src/nnfusion/core/kernels/cpu/reference/transpose.cpp b/src/nnfusion/core/kernels/cpu/reference/transpose.cpp index e5b23869e..4b9508899 100644 --- a/src/nnfusion/core/kernels/cpu/reference/transpose.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/transpose.cpp @@ -114,7 +114,7 @@ namespace nnfusion REGISTER_KERNEL_EMITTER( "Transpose", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs + 
Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs TransposeRef) // constructor } // namespace cpu diff --git a/src/nnfusion/core/kernels/cpu/reference/variable.cpp b/src/nnfusion/core/kernels/cpu/reference/variable.cpp index 8ea594d1c..2b8bf9c0c 100644 --- a/src/nnfusion/core/kernels/cpu/reference/variable.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/variable.cpp @@ -68,5 +68,5 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Variable", //op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT), //attrs + Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs cpu::Variable) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp b/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp index 02dadaeef..cd725f542 100644 --- a/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp +++ b/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp @@ -8,7 +8,7 @@ using namespace nnfusion::kernels; #define REGISTER_EW_KERNEL(OP_NAME) \ REGISTER_KERNEL_EMITTER("" #OP_NAME "", \ - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("simd").Priority(5), \ + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5), \ cpu::ElementwiseSimd); REGISTER_EW_KERNEL(Abs) diff --git a/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp b/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp index 6643e9b23..1423244bf 100644 --- a/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp +++ b/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp @@ -439,5 +439,5 @@ LanguageUnit_p ElementwiseFused::emit_comments() REGISTER_KERNEL_EMITTER( "ElementwiseFused", // op_name - Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("simd").Priority(5), // attrs + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5), // attrs cpu::ElementwiseFused) diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp 
b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp index 7d21aead2..b0ebd6c8b 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp @@ -20,6 +20,7 @@ std::vector cuda::compute_strides(const std::vector& shape) std::string cuda::get_cudnn_datatype(std::string dtype) { static const std::unordered_map datatype_map{ + {"half", "CUDNN_DATA_HALF"}, {"float", "CUDNN_DATA_FLOAT"}, {"double", "CUDNN_DATA_DOUBLE"}, {"int8_t", "CUDNN_DATA_INT8"}, @@ -30,11 +31,11 @@ std::string cuda::get_cudnn_datatype(std::string dtype) return p->second; } -LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc) +LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, string type) { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; - string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type); + string data_type = cuda::get_cudnn_datatype(type); string tensor_format = "CUDNN_TENSOR_NCHW"; lu << "cudnnTensorDescriptor_t " << desc << ";\n"; lu << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << desc << "));\n"; @@ -91,12 +92,12 @@ LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& s return _lu; } -LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc) +LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, string type) { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; - string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type); + string data_type = cuda::get_cudnn_datatype(type); string tensor_format = "CUDNN_TENSOR_NCHW"; lu << "cudnnFilterDescriptor_t " << desc << ";\n"; lu << "CUDNN_SAFE_CALL(cudnnCreateFilterDescriptor(&" << desc << "));\n"; @@ -143,12 +144,13 @@ LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc LanguageUnit_p cuda::get_cudnn_convolution_descriptor(const Shape& padding, 
const Strides& window_movement_strides, const Strides& window_dilation_strides, - string desc) + string desc, + string type) { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; - string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type); + string data_type = cuda::get_cudnn_datatype(type); string tensor_format = "CUDNN_TENSOR_NCHW"; lu << "cudnnConvolutionDescriptor_t " << desc << ";\n"; lu << "CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&" << desc << "));\n"; diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp index b127e5aa5..63d63ce66 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp @@ -15,12 +15,14 @@ namespace nnfusion std::vector compute_strides(const std::vector& shape); std::string get_cudnn_datatype(std::string dtype); LanguageUnit_p cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, - string desc); + string desc, + string type = "float"); LanguageUnit_p get_cudnn_convolution_descriptor(const Shape& padding, const Strides& window_movement_strides, const Strides& window_dilation_strides, - string desc); - LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc); + string desc, + string type = "float"); + LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc, string type = "float"); LanguageUnit_p get_dropout_global_states(float ratio); inline std::string ratio2str(float ratio) { diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp index 6d21791ab..62f1bf6cc 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp @@ -14,6 +14,7 @@ LU_DEFINE(header::cudnn, "#include \n"); LU_DEFINE(header::super_scaler, "#include \"super_scaler.h\"\n"); LU_DEFINE(header::cupti, "#include \n"); LU_DEFINE(header::cuda_prof_api, 
"#include \n"); +LU_DEFINE(header::cuda_fp16, "#include "); // Macro LU_DEFINE( @@ -223,6 +224,15 @@ __device__ __forceinline__ float load(const float* __restrict__ in, int i=0, b } return v; } +__device__ __forceinline__ half load(const half* __restrict__ in, int i=0, bool b=true) +{ + half v = 0.0f; + if (b) + { + v = __ldg(in + i); + } + return v; +} __device__ __forceinline__ int32_t load(const int32_t* __restrict__ in, int i=0, bool b=true) { int32_t v = 0; diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp index 48ab0eb3c..ecaeb6c89 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp @@ -16,6 +16,7 @@ namespace nnfusion LU_DECLARE(super_scaler); LU_DECLARE(cupti); LU_DECLARE(cuda_prof_api); + LU_DECLARE(cuda_fp16); } // namespace header namespace macro diff --git a/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp b/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp index c97f329fa..9ef0d123c 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp @@ -65,5 +65,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER(__KernelOpType__, // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs cuda::__KernelUniqueClassName__) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/adam_optimizer.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/adam_optimizer.cpp index 7cb3efed8..8008f9d62 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/adam_optimizer.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/adam_optimizer.cpp @@ -113,5 +113,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; 
REGISTER_KERNEL_EMITTER("AdamOptimizer", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), cuda::AdamOptimizer) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp index 90a5fa94f..b19c6d2ae 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp @@ -90,5 +90,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("AddN", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::AddN) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp index 9e5ccc8f5..525968ab2 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp @@ -50,5 +50,5 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("AllReduce", //op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs cuda::SuperScalerAllReduce) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp index 29de6ccb7..1adc0952c 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp @@ -35,5 +35,5 @@ LanguageUnit_p cuda::AnyOP::emit_dependency() // Register Pad kernel emitter REGISTER_KERNEL_EMITTER("AnyOP", //op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs cuda::AnyOP) // constructor diff --git 
a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp index 0ad0fee4e..566e1b535 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp @@ -106,5 +106,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("ApplyAdam", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::ApplyAdam) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp index 6dbb94934..05dcc1087 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp @@ -76,8 +76,8 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("ApplyGradient", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::ApplyGradientDescent) REGISTER_KERNEL_EMITTER("ApplyGradientDescent", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::ApplyGradientDescent) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp index 7d3e76f74..72716bd4e 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp @@ -80,5 +80,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("ApplyMomentum", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + 
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::ApplyMomentum) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp index 867ef132b..f22ba886b 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp @@ -76,5 +76,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Assign", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::Assign) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp index 00bfb66ec..d21de903d 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp @@ -76,5 +76,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("AssignSub", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::AssignSub) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp index 81be99d8b..bc2c56b1b 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp @@ -405,10 +405,10 @@ LanguageUnit_p cuda::AvgPoolmD::emit_function_signature() REGISTER_KERNEL_EMITTER( "AvgPool", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::AvgPool1D) // constructor REGISTER_KERNEL_EMITTER( "AvgPool", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs + 
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs cuda::AvgPoolmD) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp index bcec70235..ad81ec3d8 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp @@ -193,10 +193,10 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "BatchMatMul", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::BatchMatMul) // constructor REGISTER_KERNEL_EMITTER( "BatchMatMul", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::BatchMatMul) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp index 352deab10..6a64bee78 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp @@ -202,11 +202,11 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn").Priority(2), // attrs cuda::BatchNorm) // constructor REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs cuda::BatchNormNCHW) // constructor REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name - 
Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda").Priority(2), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs cuda::BatchNormNCHW) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp index 2df9329d8..22cea1999 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp @@ -72,5 +72,5 @@ void BlockFusionFused::set_launch_config() REGISTER_KERNEL_EMITTER( "BlockFusionFused", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::BlockFusionFused) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp index 060004fb1..08660ed41 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp @@ -195,9 +195,9 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Broadcast", //op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs cuda::Broadcast) // constructor REGISTER_KERNEL_EMITTER("Broadcast", //op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs cuda::RocmBroadcast) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp index dc06709fd..3b39408e0 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp @@ -325,7 +325,7 @@ namespace nnfusion using 
namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Concat", //op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT), //attrs + Device(CUDA_GPU).TypeConstraint(element::f32), //attrs cuda::Concat) // constructor namespace nnfusion @@ -480,5 +480,5 @@ namespace nnfusion } // namespace nnfusion REGISTER_KERNEL_EMITTER("Concat", //op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT), //attrs + Device(CUDA_GPU).TypeConstraint(element::f32), //attrs cuda::ConcatKernel) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp index 0a4af44c9..586091381 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp @@ -74,5 +74,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "ConcatOffset", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::ConcatOffset) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp index e1f1d5d14..d6e0d9000 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp @@ -120,5 +120,5 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Constant", //op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs cuda::Constant) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp index a8053d74d..7ddcb0426 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp +++ 
b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp @@ -12,6 +12,12 @@ cuda::ConvolutionCudnn::ConvolutionCudnn(shared_ptr ctx) { auto conv = static_pointer_cast(ctx->gnode->get_op_ptr()); + input_type = ctx->inputs[0]->get_element_type(); + filter_type = ctx->inputs[1]->get_element_type(); + output_type = ctx->outputs[0]->get_element_type(); + NNFUSION_CHECK(input_type == filter_type && input_type == output_type) + << "Convolution input datatype (" << input_type << ") should be the same with that of filter (" << filter_type << "), and that of output (" << output_type << ")."; + conv_type = input_type; input_shape = ctx->inputs[0]->get_shape(); filter_shape = ctx->inputs[1]->get_shape(); output_shape = ctx->outputs[0]->get_shape(); @@ -79,13 +85,14 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_body() padding_below[i] = static_cast(padding_below_diff[i]); } + { // lu << "cudnnDataType_t data_type = " << get_cudnn_datatype(dtype) << ";\n"; - lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0")->get_code(); - lu << cudnn_tensor_descriptor_from_shape(output_shape, "tensor_desc_1")->get_code(); - lu << get_cudnn_filter_descriptor(filter_shape, "filter_desc")->get_code(); + lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type)->get_code(); + lu << cudnn_tensor_descriptor_from_shape(output_shape, "tensor_desc_1", output_type)->get_code(); + lu << get_cudnn_filter_descriptor(filter_shape, "filter_desc", filter_type)->get_code(); lu << get_cudnn_convolution_descriptor( - padding_below, window_movement_strides, window_dilation_strides, "conv_desc") + padding_below, window_movement_strides, window_dilation_strides, "conv_desc", conv_type) ->get_code(); lu << R"( @@ -207,5 +214,10 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_signature() REGISTER_KERNEL_EMITTER( "Convolution", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs + 
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs + cuda::ConvolutionCudnn) // constructor + +REGISTER_KERNEL_EMITTER( + "Convolution", // op_name + Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cudnn_kernel").Priority(2), // attrs cuda::ConvolutionCudnn) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.hpp index 654cae2ca..444f2743a 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.hpp @@ -22,6 +22,7 @@ namespace nnfusion bool require_cudnn_handle() override { return true; } private: nnfusion::Shape input_shape, filter_shape, output_shape; + element::Type input_type, filter_type, output_type, conv_type; nnfusion::Strides window_dilation_strides, window_movement_strides, data_dilation_strides; nnfusion::CoordinateDiff padding_below_diff, padding_above_diff; diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/crossentropy_fwdbwd_softmax_bwd_large.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/crossentropy_fwdbwd_softmax_bwd_large.cpp index 8eadc9d1a..b8bf9699f 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/crossentropy_fwdbwd_softmax_bwd_large.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/crossentropy_fwdbwd_softmax_bwd_large.cpp @@ -78,5 +78,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("CrossEntropyFwdBwdWithSoftmaxBwdLarge", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), cuda::CrossEntropyFwdBwdWithSoftmaxBwdLarge) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp index 636c8ba2b..2b1dce365 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp +++ 
b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp @@ -373,5 +373,5 @@ LanguageUnit_p cuda::DepthwiseConv2dNative::emit_dependency() REGISTER_KERNEL_EMITTER( "DepthwiseConv2dNative", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::DepthwiseConv2dNative) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 973204f87..7b7a3d606 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -30,6 +30,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() auto gemm = static_pointer_cast(ctx->gnode->get_op_ptr()); auto trans_A = gemm->get_transpose_A(); auto trans_B = gemm->get_transpose_B(); + auto dtype = ctx->outputs[0]->get_element_type(); LanguageUnit_p _lu(new LanguageUnit(get_function_name())); auto& lu = *_lu; @@ -38,6 +39,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() // void kernel(m_context->dtypes[0]* input0, m_context->dtypes[0]* input1, m_context->dtypes[2]* output0) //lu.block_begin(); + if (dtype == element::f32) { // case 1: Scalar * Tensor if (arg0_shape.empty() || arg1_shape.empty()) @@ -201,6 +203,86 @@ LanguageUnit_p cuda::Dot::emit_function_body() << " static_cast(output0)," << " " << n << "));\n"; } + } else if (dtype == element::f16) { + size_t axes_for_m_count = arg0_shape.size() - reduction_axes; + size_t axes_for_n_count = arg1_shape.size() - reduction_axes; + size_t axes_for_k_count = reduction_axes; + size_t m = 1; + size_t n = 1; + size_t k = 1; + + // check if input and output size correct + // check and calculate k for arg0 and arg1 + size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k + size_t arg1_k_idx = 0; // first axe in arg1 for k + + for (size_t i = 0; i < axes_for_k_count; i++) + { + k *= arg0_shape[arg0_k_idx]; + if 
(arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + { + std::vector arg_vec{"arg0", "arg1"}; + std::vector shape_vec{arg0_shape, arg1_shape}; + + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } + } + // check and calculate m for arg0 and out + size_t arg0_m_idx = 0; // first axe in arg0 for m + size_t out_m_idx = 0; // first axe in out for m + for (size_t i = 0; i < axes_for_m_count; i++) + { + m *= arg0_shape[arg0_m_idx]; + if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + { + std::vector arg_vec{"arg0", "output"}; + std::vector shape_vec{arg0_shape, out_shape}; + + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } + } + // check and calculate n for arg1 and out + size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n + size_t out_n_idx = axes_for_m_count; // first axe in out for n + for (size_t i = 0; i < axes_for_n_count; i++) + { + n *= arg1_shape[arg1_n_idx]; + if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + { + std::vector arg_vec{"arg1", "output"}; + std::vector shape_vec{arg1_shape, out_shape}; + + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } + } + + lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << "));\n"; + } else { + NNFUSION_CHECK_FAIL() << 
"Unsupported datatype " << dtype << " for kernel dot."; } //lu.block_end(); return _lu; @@ -256,10 +338,15 @@ LanguageUnit_p cuda::Dot::emit_function_signature() REGISTER_KERNEL_EMITTER( "Dot", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cublas").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs + cuda::Dot) // constructor + +REGISTER_KERNEL_EMITTER( + "Dot", // op_name + Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs cuda::Dot) // constructor REGISTER_KERNEL_EMITTER( "Dot", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cublas").Priority(2), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs cuda::Dot) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp index b827eef70..25b4c51ce 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp @@ -254,9 +254,9 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("DropoutTraining", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn"), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn"), // attrs cuda::DropoutTraining) // constructor REGISTER_KERNEL_EMITTER("DropoutTrainingGrad", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn"), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn"), // attrs cuda::DropoutTrainingGrad) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp index b4caa5e3c..96385a2f2 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp @@ -123,5 +123,5 @@ LanguageUnit_p cuda::DynamicStitch::emit_dependency() 
REGISTER_KERNEL_EMITTER( "DynamicStitch", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::DynamicStitch) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp index 949bc78a2..9597d122e 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp @@ -9,7 +9,7 @@ using namespace nnfusion::kernels; #define REGISTER_EW_KERNEL(OP_NAME) \ REGISTER_KERNEL_EMITTER( \ "" #OP_NAME "", \ - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("element_wise").Priority(2), \ + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("element_wise").Priority(2), \ cuda::ElementWise); REGISTER_EW_KERNEL(Abs) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp index d8a104fa4..c4c3b0bdc 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp @@ -341,5 +341,5 @@ void ElementWiseFused::compute_best_config(int& grids, int& blocks, int& bound) REGISTER_KERNEL_EMITTER( "ElementWiseFused", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::ElementWiseFused) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp index 0bbaa8366..94fa0e506 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp @@ -116,7 +116,7 @@ LanguageUnit_p cuda::Gather1D::emit_dependency() 
REGISTER_KERNEL_EMITTER( "GatherV2", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::Gather1D) // constructor cuda::Gather1DGrad::Gather1DGrad(shared_ptr ctx) @@ -229,5 +229,5 @@ LanguageUnit_p cuda::Gather1DGrad::emit_dependency() REGISTER_KERNEL_EMITTER( "GatherGrad", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::Gather1DGrad) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp index 70be110e2..6743691cf 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp @@ -279,9 +279,9 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("GatherND", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs cuda::GatherND) // constructor REGISTER_KERNEL_EMITTER("GatherNDGrad", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs cuda::GatherNDGrad) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp index 9e119a500..6dd9bba2e 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp @@ -65,6 +65,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("InvertPermutation", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority( + 
Device(CUDA_GPU).TypeConstraint(element::f32).Priority( 2), // TODO: this op input and output will all be int cuda::InvertPermutation) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp index 7a44d730a..bc9a6f463 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp @@ -78,5 +78,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("LayerNorm", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudalib"), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudalib"), // attrs cuda::LayerNorm) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp index 49be708e2..ed603d218 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp @@ -241,10 +241,10 @@ LanguageUnit_p cuda::MaxPoolmD::emit_function_signature() REGISTER_KERNEL_EMITTER( "MaxPool", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::MaxPool1D) // constructor REGISTER_KERNEL_EMITTER( "MaxPool", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs cuda::MaxPoolmD) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp index e0fea7288..1368e1244 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp @@ -110,5 +110,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( 
"OneHot", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::OneHot) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp index 68434d486..0ac7149dc 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp @@ -138,7 +138,7 @@ KernelRegistrar kernel_registrar0( "Pad", Name("Pad") .Device(CUDA_GPU) - .TypeConstraint(DT_FLOAT) + .TypeConstraint(element::f32) .Tag("cuda_kernel") .KernelFactory([](shared_ptr context) -> shared_ptr { return make_shared(context); @@ -148,5 +148,5 @@ KernelRegistrar kernel_registrar0( REGISTER_KERNEL_EMITTER( "Pad", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::Pad) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp index 4eb1d090c..397eeb9b4 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp @@ -64,5 +64,5 @@ LanguageUnit_p cuda::Range::emit_dependency() } REGISTER_KERNEL_EMITTER( "Range", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::Range) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp index ae0b6097c..15dd2d3ce 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp @@ -7,55 +7,55 @@ using namespace 
nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Max", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::Reduce) REGISTER_KERNEL_EMITTER( "Max", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) REGISTER_KERNEL_EMITTER("Min", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::Reduce) REGISTER_KERNEL_EMITTER( "Min", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) REGISTER_KERNEL_EMITTER("Product", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::Reduce) REGISTER_KERNEL_EMITTER( "Product", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) REGISTER_KERNEL_EMITTER("Sum", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::Reduce) REGISTER_KERNEL_EMITTER( "Sum", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) REGISTER_KERNEL_EMITTER("Sum", - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::Reduce) 
REGISTER_KERNEL_EMITTER( "Sum", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) REGISTER_KERNEL_EMITTER("ReduceAny", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::Reduce) REGISTER_KERNEL_EMITTER( "ReduceAny", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp index 2c94e1335..6760325ec 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp @@ -110,5 +110,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "All", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::All) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp index caf0b0f17..168a91011 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp @@ -558,20 +558,20 @@ LanguageUnit_p cuda::ReshapeMemcpy::emit_function_signature() REGISTER_KERNEL_EMITTER( "Reshape", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel_2D").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_2D").Priority(2), // attrs cuda::Reshape2D) // constructor REGISTER_KERNEL_EMITTER( "Reshape", // op_name - 
Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel_3D").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_3D").Priority(2), // attrs cuda::Reshape3D) // constructor REGISTER_KERNEL_EMITTER( "Reshape", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel_D").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_D").Priority(2), // attrs cuda::ReshapehD) // constructor REGISTER_KERNEL_EMITTER( "Reshape", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReshapeMemcpy) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp index 61d57f210..46e81ade7 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp @@ -85,5 +85,5 @@ LanguageUnit_p cuda::Result::emit_dependency() REGISTER_KERNEL_EMITTER( "Result", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::Result) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp index 56df9f1de..36f2d39b5 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp @@ -102,5 +102,5 @@ LanguageUnit_p cuda::Reverse::emit_dependency() REGISTER_KERNEL_EMITTER( "Reverse", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::Reverse) // constructor \ No newline at end of file diff --git 
a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp index dc14f153b..6f0e0e2bc 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp @@ -125,9 +125,9 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "ReverseSequence", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::ReverseSequence) // constructor REGISTER_KERNEL_EMITTER("ReverseSequence", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(2), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), // attrs cuda::RocmReverseSequence) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp index 4718b9127..5ed2d8ea4 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp @@ -113,5 +113,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("BatchMatMul", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(4), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), // attrs cuda::BatchGemmFixed) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp index 9cc765ba2..63dd091e4 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp @@ -173,5 +173,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; 
REGISTER_KERNEL_EMITTER("Broadcast", //op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(3), //attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Priority(3), //attrs cuda::RocmBiasBroadcast) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp index b88ce8ab3..f5f8b3a50 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp @@ -296,5 +296,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Broadcast", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(4), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), // attrs cuda::RocmManualBroadcast) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp index c9b54b46f..04fb041b8 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp @@ -142,5 +142,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Convolution", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::ConvFwdFixed) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp index c8920754e..2b19b8db0 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp @@ -227,5 +227,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Convolution", // op_name - 
Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs cuda::RocmConvolutionCudnn) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp index 15cfecdc5..983b7fce3 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp @@ -208,5 +208,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Dot", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::GemmFixed) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp index af26b29a9..a476ddd06 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp @@ -335,7 +335,7 @@ using namespace nnfusion::kernels; #define REGISTER_GPU_KERNEL(KEY, OP_NAME) \ REGISTER_KERNEL_EMITTER(KEY, \ - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(4), \ + Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), \ cuda::RocmReduce) REGISTER_GPU_KERNEL("Sum", Add) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp index b68c94011..a2a54782d 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp @@ -135,5 +135,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Softmax", // op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + 
Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::RocmSoftmax) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp index 29a0d2090..fb2fd5930 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp @@ -98,7 +98,7 @@ using namespace nnfusion::kernels; #define REGISTER_SCATTER_KERNEL(OP_NAME, KERNEL_NAME) \ REGISTER_KERNEL_EMITTER("" #KERNEL_NAME "", \ - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("scatter").Priority(2), \ + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("scatter").Priority(2), \ cuda::Scatter); REGISTER_SCATTER_KERNEL(Subtract, ScatterSub); diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/select_node.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/select_node.cpp index 33cc8bed3..c55ac7a3f 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/select_node.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/select_node.cpp @@ -72,5 +72,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("SelectNode", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), cuda::SelectNode) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp index 9bb7e9507..d5bb5910d 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp @@ -139,5 +139,5 @@ LanguageUnit_p cuda::Slice::emit_dependency() REGISTER_KERNEL_EMITTER( "Slice", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::Slice) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp 
b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp index 16287054b..0dba0aec4 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp @@ -133,7 +133,7 @@ LanguageUnit_p cuda::Softmax::emit_function_signature() REGISTER_KERNEL_EMITTER( "Softmax", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs cuda::Softmax) // constructor cuda::SoftmaxGrad::SoftmaxGrad(shared_ptr ctx) @@ -267,5 +267,5 @@ LanguageUnit_p cuda::SoftmaxGrad::emit_function_signature() REGISTER_KERNEL_EMITTER( "SoftmaxGrad", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs cuda::SoftmaxGrad) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp index 836d23f0b..063716257 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp @@ -127,5 +127,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("SparseApplyMomentum", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::SparseApplyMomentum) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp index c89bdfadb..6f7192da6 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp @@ -116,5 +116,5 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "StopGradient", // op_name - 
Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::StopGradient) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp index d305cd12e..d796ef822 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp @@ -114,5 +114,5 @@ LanguageUnit_p cuda::StridedSliceGrad::emit_dependency() } REGISTER_KERNEL_EMITTER( "StridedSliceGrad", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::StridedSliceGrad) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp index 75d4ad303..4b22d7c25 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp @@ -147,9 +147,9 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Tile", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::Tile) REGISTER_KERNEL_EMITTER("Tile", //op_name - Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs + Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs cuda::RocmTile) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp index 19ae56c60..7016e0518 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp @@ -144,5 +144,5 
@@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "Transpose", // op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::Transpose) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp index 782a4bf44..e816cd7b9 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp @@ -186,5 +186,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("UnsortedSegmentSum", - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), cuda::UnsortedSegmentSum) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp index f829a7e6d..368e24241 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp @@ -78,5 +78,5 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Variable", //op_name - Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs cuda::Variable) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp index 547994202..0a457b435 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp @@ -59,5 +59,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Zeros", // op_name - 
Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), // attrs + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs cuda::Zeros) // constructor diff --git a/src/nnfusion/core/kernels/hlsl/constant.cpp b/src/nnfusion/core/kernels/hlsl/constant.cpp index 43fd2ae60..61a701bf7 100644 --- a/src/nnfusion/core/kernels/hlsl/constant.cpp +++ b/src/nnfusion/core/kernels/hlsl/constant.cpp @@ -77,5 +77,5 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Constant", - Device(HLSL).TypeConstraint(DT_FLOAT).Tag("hlsl_kernel"), + Device(HLSL).TypeConstraint(element::f32).Tag("hlsl_kernel"), hlsl::Constant) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/hlsl/parameter.cpp b/src/nnfusion/core/kernels/hlsl/parameter.cpp index 1d261fe32..2ed1d7740 100644 --- a/src/nnfusion/core/kernels/hlsl/parameter.cpp +++ b/src/nnfusion/core/kernels/hlsl/parameter.cpp @@ -56,5 +56,5 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Parameter", - Device(HLSL).TypeConstraint(DT_FLOAT).Tag("hlsl_kernel"), + Device(HLSL).TypeConstraint(element::f32).Tag("hlsl_kernel"), hlsl::Parameter) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/hlsl/result.cpp b/src/nnfusion/core/kernels/hlsl/result.cpp index 6441da6eb..39f584e73 100644 --- a/src/nnfusion/core/kernels/hlsl/result.cpp +++ b/src/nnfusion/core/kernels/hlsl/result.cpp @@ -48,5 +48,5 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Result", - Device(HLSL).TypeConstraint(DT_FLOAT).Tag("hlsl_kernel"), + Device(HLSL).TypeConstraint(element::f32).Tag("hlsl_kernel"), hlsl::Result) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp index ab6ce4d0b..f18d75b7d 100644 --- a/src/nnfusion/core/kernels/kernel_registration.cpp +++ 
b/src/nnfusion/core/kernels/kernel_registration.cpp @@ -3,6 +3,7 @@ #include "kernel_registration.hpp" #include "nnfusion/util/util.hpp" +#include "ngraph/src/nnfusion/common/type/element_type.hpp" using namespace nnfusion; using namespace nnfusion::kernels; @@ -18,7 +19,7 @@ KernelRegistration& KernelRegistration::Device(const NNFusion_DeviceType device_ return *this; } -KernelRegistration& KernelRegistration::TypeConstraint(const DataType data_type) +KernelRegistration& KernelRegistration::TypeConstraint(const element::Type data_type) { m_data_type = data_type; return *this; @@ -72,7 +73,7 @@ shared_ptr } shared_ptr KernelRegistry::FindKernelRegistration( - const string op_name, const NNFusion_DeviceType& device_type, const DataType data_type) + const string op_name, const NNFusion_DeviceType& device_type, const element::Type data_type) { std::vector> matched_regs; auto regs = m_kernel_registry.equal_range(op_name); @@ -97,7 +98,7 @@ shared_ptr KernelRegistry::FindKernelRegistration( } std::vector> KernelRegistry::FindKernelRegistrations( - const string op_name, const NNFusion_DeviceType& device_type, const DataType data_type) + const string op_name, const NNFusion_DeviceType& device_type, const element::Type data_type) { std::vector> matched_regs; auto regs = m_kernel_registry.equal_range(op_name); diff --git a/src/nnfusion/core/kernels/kernel_registration.hpp b/src/nnfusion/core/kernels/kernel_registration.hpp index 07c70d9c5..db19454aa 100644 --- a/src/nnfusion/core/kernels/kernel_registration.hpp +++ b/src/nnfusion/core/kernels/kernel_registration.hpp @@ -4,6 +4,7 @@ #pragma once #include "kernel_emitter.hpp" +#include "ngraph/src/nnfusion/common/type/element_type.hpp" namespace nnfusion { @@ -24,7 +25,7 @@ namespace nnfusion // Specify the data (inputs/outputs) types this kernel supports // Return *this - KernelRegistration& TypeConstraint(const DataType data_type); + KernelRegistration& TypeConstraint(const element::Type data_type); // Add an arbitrary 
user-defined tag on the kernel to allow the operator // to choose this kernel @@ -57,7 +58,7 @@ namespace nnfusion friend class KernelRegistry; string m_op_name; NNFusion_DeviceType m_device_type; - DataType m_data_type; + element::Type m_data_type; string m_tag; Factory m_factory; size_t m_priority = 0; @@ -79,11 +80,11 @@ namespace nnfusion shared_ptr FindKernelRegistration(const string op_name, const NNFusion_DeviceType& device_type, - const DataType data_type); + const element::Type data_type); std::vector> FindKernelRegistrations(const string op_name, const NNFusion_DeviceType& device_type, - const DataType data_type); + const element::Type data_type); shared_ptr KernelSelect(std::vector>& matched_regs); diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp index 65ec30522..581ed3e03 100755 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -838,6 +838,7 @@ void CudaCodegenPass::create_main_file(std::shared_ptr ctx, re_main->require(header::limits); re_main->require(header::cuda_prof_api); + re_main->require(header::cuda_fp16); re_main->require(macro::CUDA_SAFE_CALL); lu_main << "#include \"nnfusion_rt.h\"\n"; diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp b/src/nnfusion/engine/pass/graph/kernel_selection.cpp index be465f93b..fcd3148c7 100644 --- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp @@ -436,7 +436,7 @@ bool DefaultKernelSelector::register_antares_kernel() op_name, Name(op_name) .Device(CUDA_GPU) - .TypeConstraint(DT_FLOAT) + .TypeConstraint(element::f32) .Tag("antares") .Priority(9) .KernelFactory([](shared_ptr context) -> shared_ptr { @@ -447,7 +447,7 @@ bool DefaultKernelSelector::register_antares_kernel() op_name, Name(op_name) .Device(GENERIC_CPU) - .TypeConstraint(DT_FLOAT) + .TypeConstraint(element::f32) .Tag("antares") 
.Priority(9) .KernelFactory([](shared_ptr context) -> shared_ptr { @@ -458,7 +458,7 @@ bool DefaultKernelSelector::register_antares_kernel() op_name, Name(op_name) .Device(HLSL) - .TypeConstraint(DT_FLOAT) + .TypeConstraint(element::f32) .Tag("antares") .Priority(9) .KernelFactory([](shared_ptr context) -> shared_ptr { diff --git a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp index 9dba39c21..1d8f5e926 100644 --- a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp +++ b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp @@ -284,56 +284,6 @@ namespace nnfusion return true; } - /* - const std::map*)>, - const nnfusion::element::Type>>& - TF_NGRAPH_CONST_MAP() - { - static const std::map< - tensorflow::DataType, - std::pair*)>, - const nnfusion::element::Type>> - the_map = { - {tensorflow::DataType::DT_FLOAT, - std::make_pair(MakeConstOp, nnfusion::element::f32)}, - {tensorflow::DataType::DT_DOUBLE, - std::make_pair(MakeConstOp, nnfusion::element::f64)}, - {tensorflow::DataType::DT_INT8, - std::make_pair(MakeConstOp, nnfusion::element::i8)}, - {tensorflow::DataType::DT_INT16, - std::make_pair(MakeConstOp, nnfusion::element::i16)}, - // {tensorflow::DataType::DT_QINT8, - // std::make_pair(MakeConstOp, nnfusion::element::i8)}, - // {tensorflow::DataType::DT_QUINT16, - // std::make_pair(MakeConstOp, nnfusion::element::u8)}, - {tensorflow::DataType::DT_INT32, - std::make_pair(MakeConstOp, nnfusion::element::i32)}, - {tensorflow::DataType::DT_INT64, - std::make_pair(MakeConstOp, nnfusion::element::i64)}, - {tensorflow::DataType::DT_UINT8, - std::make_pair(MakeConstOp, nnfusion::element::u8)}, - {tensorflow::DataType::DT_UINT16, - std::make_pair(MakeConstOp, nnfusion::element::u16)}, - {tensorflow::DataType::DT_UINT32, - std::make_pair(MakeConstOp, nnfusion::element::u32)}, - {tensorflow::DataType::DT_UINT64, - std::make_pair(MakeConstOp, nnfusion::element::u64)}, - {tensorflow::DataType::DT_BOOL, - 
std::make_pair(MakeConstOp, nnfusion::element::boolean)}, - {tensorflow::DataType::DT_STRING, - std::make_pair(MakeConstOp, - nnfusion::element::character)}}; - // TODO: data type string unsupport now, bert model has string type const op used for assert - - return the_map; - } - */ - NamedNodeVector TranslateConstOp(const tensorflow::NodeDef& node, const NodeMap& all_ng_nodes, std::shared_ptr m_graph) @@ -363,20 +313,6 @@ namespace nnfusion return ret; } - - // const std::map TF_NGRAPH_CONST_MAP = { - // {tensorflow::DataType::DT_FLOAT, nnfusion::element::f32}, - // {tensorflow::DataType::DT_DOUBLE, nnfusion::element::f64}, - // {tensorflow::DataType::DT_INT8, nnfusion::element::i8}, - // {tensorflow::DataType::DT_INT16, nnfusion::element::i16}, - // {tensorflow::DataType::DT_INT32, nnfusion::element::i32}, - // {tensorflow::DataType::DT_INT64, nnfusion::element::i64}, - // {tensorflow::DataType::DT_UINT8, nnfusion::element::u8}, - // {tensorflow::DataType::DT_UINT16, nnfusion::element::u16}, - // {tensorflow::DataType::DT_UINT32, nnfusion::element::u32}, - // {tensorflow::DataType::DT_UINT64, nnfusion::element::u64}, - // {tensorflow::DataType::DT_BOOL, nnfusion::element::boolean}, - // {tensorflow::DataType::DT_STRING, nnfusion::element::character}}; } // namespace tensorflow_import } // namespace frontend } // namespace nnfusion diff --git a/test/main.cpp b/test/main.cpp index d7c57d838..5c8ddaca8 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -36,7 +36,7 @@ int main(int argc, char** argv) op_name, nnfusion::kernels::Name(op_name) .Device(CUDA_GPU) - .TypeConstraint(DT_FLOAT) + .TypeConstraint(element::f32) .Tag("antares") .Priority(9) .KernelFactory([](shared_ptr context) From 7215c91b2f5740d462a4da928a6a61e6a7b750e7 Mon Sep 17 00:00:00 2001 From: Cjkkkk Date: Thu, 26 Nov 2020 13:14:27 +0800 Subject: [PATCH 04/32] add pass info to log --- src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp | 2 +- .../engine/pass/graph/batchnorm_inference_folding_pass.cpp | 4 
++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 973204f87..7cd431cc5 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -262,4 +262,4 @@ REGISTER_KERNEL_EMITTER( REGISTER_KERNEL_EMITTER( "Dot", // op_name Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor + cuda::Dot) // constructor \ No newline at end of file diff --git a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp index ab1a671fe..1dc3b5521 100644 --- a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp +++ b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp @@ -886,6 +886,8 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptrget_name(); for (auto pattern : BN_FOLDING_PATTERNS) { BatchNormInferenceOptimizer optimizer(graph, pattern); @@ -896,6 +898,8 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptrget_name(); } return true; } \ No newline at end of file From 2e757837da2386dd169ac83eb5d9fda2aead1f9b Mon Sep 17 00:00:00 2001 From: Cjkkkk Date: Thu, 26 Nov 2020 13:41:10 +0800 Subject: [PATCH 05/32] update cudnn datatype mapping --- src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp | 10 +++++----- src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp | 8 ++++---- src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 2 +- src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp | 4 ++-- src/nnfusion/engine/pass/graph/kernel_selection.cpp | 8 ++++---- .../engine/pass/graph/runtime_const_folding_pass.cpp | 8 ++++---- src/nnfusion/engine/profiler/cpu_runtime.cpp | 2 +- src/nnfusion/engine/profiler/profiler.cpp | 2 +- src/nnfusion/frontend/util/evaluator.hpp | 6 +++--- test/nnfusion/engine/profiler.cpp | 2 +- 
test/nnfusion/kernels/sample.cpp | 2 +- test/nnfusion/test_util/common.hpp | 2 +- 12 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp index b0ebd6c8b..f8891b025 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp @@ -17,7 +17,7 @@ std::vector cuda::compute_strides(const std::vector& shape) return strides; } -std::string cuda::get_cudnn_datatype(std::string dtype) +std::string cuda::get_cudnn_datatype(element::Type dtype) { static const std::unordered_map datatype_map{ {"half", "CUDNN_DATA_HALF"}, @@ -25,13 +25,13 @@ std::string cuda::get_cudnn_datatype(std::string dtype) {"double", "CUDNN_DATA_DOUBLE"}, {"int8_t", "CUDNN_DATA_INT8"}, {"int32_t", "CUDNN_DATA_INT32"}}; - auto p = datatype_map.find(dtype); + auto p = datatype_map.find(dtype.c_type_string()); NNFUSION_CHECK(p != datatype_map.end()) << dtype << " is not supported by cuDNN"; return p->second; } -LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, string type) +LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, element::Type type) { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; @@ -92,7 +92,7 @@ LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& s return _lu; } -LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, string type) +LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type) { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; @@ -145,7 +145,7 @@ LanguageUnit_p cuda::get_cudnn_convolution_descriptor(const Shape& padding, const Strides& window_movement_strides, const Strides& window_dilation_strides, string desc, - string type) + element::Type type) { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; diff 
--git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp index 63d63ce66..fc8cdd8dc 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp @@ -13,16 +13,16 @@ namespace nnfusion namespace cuda { std::vector compute_strides(const std::vector& shape); - std::string get_cudnn_datatype(std::string dtype); + std::string get_cudnn_datatype(element::Type type); LanguageUnit_p cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, - string type = "float"); + element::Type type = element::f32); LanguageUnit_p get_cudnn_convolution_descriptor(const Shape& padding, const Strides& window_movement_strides, const Strides& window_dilation_strides, string desc, - string type = "float"); - LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc, string type = "float"); + element::Type type = element::f32); + LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type = element::f32); LanguageUnit_p get_dropout_global_states(float ratio); inline std::string ratio2str(float ratio) { diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp index 581ed3e03..cf8e3d780 100755 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -345,7 +345,7 @@ std::vector>> else { auto kernel_reg = KernelRegistry::Global()->FindKernelRegistration( - "AnyOP", device_type(), DT_FLOAT); + "AnyOP", device_type(), element::f32); NNFUSION_CHECK(kernel_reg != nullptr) << "AnyOp Kernel not found, op=" << ins->getGNode()->get_op_type(); shared_ptr ctx(new KernelContext(ins->getGNode())); diff --git a/src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp b/src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp index 66a649307..f5112c3b6 100644 --- 
a/src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp @@ -649,12 +649,12 @@ class KernelFuseOptimizer if (n_device_type != GENERIC_CPU) { kernel_reg = KernelRegistry::Global()->FindKernelRegistration( - "ElementWiseFused", CUDA_GPU, DT_FLOAT); + "ElementWiseFused", CUDA_GPU, element::f32); } else { kernel_reg = KernelRegistry::Global()->FindKernelRegistration( - "ElementwiseFused", GENERIC_CPU, DT_FLOAT); + "ElementwiseFused", GENERIC_CPU, element::f32); } NNFUSION_CHECK_NOT_NULLPTR(kernel_reg); auto ctx = std::make_shared(); diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp b/src/nnfusion/engine/pass/graph/kernel_selection.cpp index fcd3148c7..7b212ee2f 100644 --- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp @@ -26,7 +26,7 @@ pair IProfilingRuntime::Pointer runtime) { std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, DT_FLOAT); + KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32); // Skip since only one candidate or constant if (kernel_regs.size() == 1 || gnode->is_constant()) @@ -143,12 +143,12 @@ pair { shared_ptr ctx(new KernelContext(gnode)); std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, DT_FLOAT); + KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32); if (devtype == ROCM_GPU) { for (auto it : KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), CUDA_GPU, DT_FLOAT)) + gnode->get_op_type(), CUDA_GPU, element::f32)) kernel_regs.push_back(it); } @@ -355,7 +355,7 @@ pair NNFusion_DeviceType devtype) { std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, DT_FLOAT); + KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), 
devtype, element::f32); shared_ptr ctx(new KernelContext(gnode)); std::vector functions; diff --git a/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp b/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp index 115da8c53..1dcb108e6 100644 --- a/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp +++ b/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp @@ -98,24 +98,24 @@ int RuntimeConstantFoldingPass::runtime_const_folding_iterate_once( runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime(); NNFUSION_CHECK(runtime->check_env()); kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - it->get_op_type(), ROCM_GPU, DT_FLOAT); + it->get_op_type(), ROCM_GPU, element::f32); if (kernel_regs.size() == 0) kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - it->get_op_type(), CUDA_GPU, DT_FLOAT); + it->get_op_type(), CUDA_GPU, element::f32); } else if (backend == "CUDA") { runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime(); NNFUSION_CHECK(runtime->check_env()); kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - it->get_op_type(), CUDA_GPU, DT_FLOAT); + it->get_op_type(), CUDA_GPU, element::f32); } else if (backend == "CPU") { runtime = nnfusion::profiler::ReferenceRuntime::Runtime(); NNFUSION_CHECK(runtime->check_env()); kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - it->get_op_type(), GENERIC_CPU, DT_FLOAT); + it->get_op_type(), GENERIC_CPU, element::f32); } else { diff --git a/src/nnfusion/engine/profiler/cpu_runtime.cpp b/src/nnfusion/engine/profiler/cpu_runtime.cpp index ff505f86f..a7e6ebd97 100644 --- a/src/nnfusion/engine/profiler/cpu_runtime.cpp +++ b/src/nnfusion/engine/profiler/cpu_runtime.cpp @@ -226,7 +226,7 @@ double ReferenceRuntime::invoke(const ProfilingContext::Pointer& ke, void** inpu auto& gnode = ke->kernel->m_context->gnode; std::vector> kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), 
GENERIC_CPU, DT_FLOAT); + gnode->get_op_type(), GENERIC_CPU, element::f32); shared_ptr ctx(new KernelContext(gnode)); bool has_valid_kernel = false; diff --git a/src/nnfusion/engine/profiler/profiler.cpp b/src/nnfusion/engine/profiler/profiler.cpp index 469297f6c..174c3c108 100644 --- a/src/nnfusion/engine/profiler/profiler.cpp +++ b/src/nnfusion/engine/profiler/profiler.cpp @@ -82,7 +82,7 @@ void GraphEvaluate::create_profiling_contexts(shared_ptr gnode) return; } std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), dev_type, DT_FLOAT); + KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), dev_type, element::f32); shared_ptr ctx(new KernelContext(gnode)); for (auto kernel_reg : kernel_regs) diff --git a/src/nnfusion/frontend/util/evaluator.hpp b/src/nnfusion/frontend/util/evaluator.hpp index 4ec3f6282..1e5a56e36 100644 --- a/src/nnfusion/frontend/util/evaluator.hpp +++ b/src/nnfusion/frontend/util/evaluator.hpp @@ -109,17 +109,17 @@ namespace nnfusion if (runtime->check_env()) { kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), ROCM_GPU, DT_FLOAT); + gnode->get_op_type(), ROCM_GPU, element::f32); if (kernel_regs.size() == 0) kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), CUDA_GPU, DT_FLOAT); + gnode->get_op_type(), CUDA_GPU, element::f32); } else { runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime(); NNFUSION_CHECK(runtime->check_env()); kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), CUDA_GPU, DT_FLOAT); + gnode->get_op_type(), CUDA_GPU, element::f32); } bool const_infer_success = false; diff --git a/test/nnfusion/engine/profiler.cpp b/test/nnfusion/engine/profiler.cpp index d83639ea2..cffc74d1c 100644 --- a/test/nnfusion/engine/profiler.cpp +++ b/test/nnfusion/engine/profiler.cpp @@ -25,7 +25,7 @@ TEST(nnfusion_engine_profiler, basic_utils) // Filter out the kernels 
meeting the requirement; std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, DT_FLOAT); + KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, element::f32); shared_ptr ctx(new KernelContext(gnode)); // Gnerate Test data diff --git a/test/nnfusion/kernels/sample.cpp b/test/nnfusion/kernels/sample.cpp index da51ff135..367e6173c 100644 --- a/test/nnfusion/kernels/sample.cpp +++ b/test/nnfusion/kernels/sample.cpp @@ -24,7 +24,7 @@ TEST(nnfusion_core_kernels, sample) // Filter out the kernels meeting the requirement; std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, DT_FLOAT); + KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, element::f32); shared_ptr ctx(new KernelContext(gnode)); EXPECT_GT(kernel_regs.size(), 0); diff --git a/test/nnfusion/test_util/common.hpp b/test/nnfusion/test_util/common.hpp index 50bf4dd99..a020b3183 100644 --- a/test/nnfusion/test_util/common.hpp +++ b/test/nnfusion/test_util/common.hpp @@ -74,7 +74,7 @@ namespace nnfusion } std::vector> available_kernels = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), dev_t, DT_FLOAT); + gnode->get_op_type(), dev_t, element::f32); shared_ptr ctx(new KernelContext(gnode)); bool kernel_found = false; for (auto& kernel_reg : available_kernels) From 0ac891fe1f42bf307da0b4c7eae396661c5b0c4b Mon Sep 17 00:00:00 2001 From: Cjkkkk Date: Thu, 26 Nov 2020 13:54:42 +0800 Subject: [PATCH 06/32] add fp16 header to header file --- src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp | 2 +- src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp index 62f1bf6cc..9ced8d459 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp +++ 
b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp @@ -14,7 +14,7 @@ LU_DEFINE(header::cudnn, "#include \n"); LU_DEFINE(header::super_scaler, "#include \"super_scaler.h\"\n"); LU_DEFINE(header::cupti, "#include \n"); LU_DEFINE(header::cuda_prof_api, "#include \n"); -LU_DEFINE(header::cuda_fp16, "#include "); +LU_DEFINE(header::cuda_fp16, "#include \n"); // Macro LU_DEFINE( diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp index cf8e3d780..fd1c3b456 100755 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -803,6 +803,8 @@ void CudaCodegenPass::create_header_file(std::shared_ptr ctx lu_header << declaration::typedef_int->get_code() << "\n"; if (device_type() == CUDA_GPU || device_type() == ROCM_GPU) lu_header << header::cuda->get_code(); + // TODO only include this if half is used + lu_header << header::cuda_fp16->get_code(); lu_header << "extern \"C\" int kernel_entry("; std::string params = get_kernel_entry_paras(tu); From 1e38b5922a9d6b0e844c9332ac304d1369069236 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 27 Nov 2020 09:00:51 +0000 Subject: [PATCH 07/32] vgg11 runnable --- .../core/kernels/cuda_gpu/cuda_langunit.cpp | 20 ++ .../core/kernels/cuda_gpu/cuda_langunit.hpp | 1 + .../core/kernels/cuda_gpu/kernels/dot.cpp | 216 ++++++++++++------ .../kernels/cuda_gpu/kernels/max_pool.cpp | 21 +- .../kernels/cuda_gpu/kernels/max_pool.hpp | 4 +- .../pass/graph/codegen_dxcompute_pass.hpp | 14 +- .../pass/graph/codegen_graphcore_pass.hpp | 14 +- test/nnfusion/kernels/batch_test.cpp | 162 ++++++------- .../ngraph/src/nnfusion/common/type_info.cpp | 1 + .../ngraph/src/nnfusion/common/type_info.hpp | 28 +-- 10 files changed, 292 insertions(+), 189 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp index 9ced8d459..565c18c49 100644 --- 
a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp @@ -253,6 +253,26 @@ __device__ __forceinline__ int64_t load(const int64_t* __restrict__ in, int i= } )"); +LU_DEFINE( + declaration::cuda_fp16_scale, + R"( +__global__ void nnfusionHalfScaleKernel(half *x, half *alpha, size_t count) +{ + size_t offset = threadIdx.x + blockIdx.x * blockDim.x; + x += offset; + if (offset < count) + { + *x *= *alpha; + } +} + +void nnfusionHalfScale(half *x, half *alpha, size_t len) +{ + nnfusionHalfScaleKernel<<<(len+255)/256, 256>>>(x, alpha, len); +} + )" +) + LU_DEFINE_EXTEND(declaration::cuda_reduce_primitive, R"( #if CUDA_VERSION < 9000 diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp index ecaeb6c89..93dbc0243 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp @@ -43,6 +43,7 @@ namespace nnfusion LU_DECLARE(num_SMs); LU_DECLARE(cuda_reduce_primitive); LU_DECLARE(cuda_layer_norm); + LU_DECLARE(cuda_fp16_scale); } // namespace declaration } // namespace kernels } // namespace nnfusion diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 78c09b6b3..2f07207c3 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -204,83 +204,163 @@ LanguageUnit_p cuda::Dot::emit_function_body() << " " << n << "));\n"; } } else if (dtype == element::f16) { - size_t axes_for_m_count = arg0_shape.size() - reduction_axes; - size_t axes_for_n_count = arg1_shape.size() - reduction_axes; - size_t axes_for_k_count = reduction_axes; - size_t m = 1; - size_t n = 1; - size_t k = 1; - - // check if input and output size correct - // check and calculate k for arg0 and arg1 - size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k - size_t arg1_k_idx = 0; // 
first axe in arg1 for k - - for (size_t i = 0; i < axes_for_k_count; i++) - { - k *= arg0_shape[arg0_k_idx]; - if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + // case 1: Scalar * Tensor + // if (arg0_shape.empty() || arg1_shape.empty()) + // { + // auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape); + // size_t count = nnfusion::shape_size(second); + + // string firstarg = (arg0_shape.empty() ? "input1" : "input0"); + // string secondarg = (arg0_shape.empty() ? "input0" : "input1"); + + // lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; + + // lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n"; // copy `firstarg` to `output0` + // lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n"; + // } + // // case 2: 1d Dot + // else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes)) + // { + // for (int i = 0; i < arg0_shape.size(); i++) + // { + // if (arg0_shape[i] != arg1_shape[i]) + // { + // std::vector arg_vec{"arg0", "arg1"}; + // std::vector shape_vec{arg0_shape, arg1_shape}; + + // NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + // << nnfusion::join(shape_vec) << " respectively, at Node " + // << m_context->gnode->get_name() + // << ", do not match for dot op"; + // } + // } + + // size_t count = nnfusion::shape_size(arg0_shape); + // lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; + + // lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count + // << ", static_cast(input0), 1, static_cast(input1), 1, " + // "static_cast(output0)));\n"; + // } + // // matrix * vector + // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1)) + // { + // lu << "const float alpha = 1.0;\n const float beta = 0;\n"; + // lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, "; + // if (trans_A) + // lu << "CUBLAS_OP_N, " << 
arg0_shape[0] << ", " << arg0_shape[1] << ", "; + // else + // lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", "; + // lu << " &alpha," + // << " static_cast(input0)," << arg0_shape[1] << ", " + // << " static_cast(input1)," + // << " 1," + // << " &beta," + // << " static_cast(output0)," + // << " 1));\n"; + // } + // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) && + // (trans_A || trans_B)) + // { + // int m = trans_B ? arg1_shape[0] : arg1_shape[1]; + // int n = trans_A ? arg0_shape[1] : arg0_shape[0]; + // int k = trans_A ? arg0_shape[0] : arg0_shape[1]; + + // lu << "const half alpha = 1.0;\nconst half beta = 0;\n"; + + // lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + // << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") + // << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << "," + // << " " << n << "," + // << " " << k << "," + // << " &alpha," + // << " static_cast(input1)," + // << " " << arg1_shape[1] << "," + // << " static_cast(input0)," + // << " " << arg0_shape[1] << "," + // << " &beta," + // << " static_cast(output0)," + // << " " << m << "));\n"; + // } else { + size_t axes_for_m_count = arg0_shape.size() - reduction_axes; + size_t axes_for_n_count = arg1_shape.size() - reduction_axes; + size_t axes_for_k_count = reduction_axes; + size_t m = 1; + size_t n = 1; + size_t k = 1; + + // check if input and output size correct + // check and calculate k for arg0 and arg1 + size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k + size_t arg1_k_idx = 0; // first axe in arg1 for k + + for (size_t i = 0; i < axes_for_k_count; i++) { - std::vector arg_vec{"arg0", "arg1"}; - std::vector shape_vec{arg0_shape, arg1_shape}; + k *= arg0_shape[arg0_k_idx]; + if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + { + std::vector arg_vec{"arg0", "arg1"}; + std::vector shape_vec{arg0_shape, arg1_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << 
nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - // check and calculate m for arg0 and out - size_t arg0_m_idx = 0; // first axe in arg0 for m - size_t out_m_idx = 0; // first axe in out for m - for (size_t i = 0; i < axes_for_m_count; i++) - { - m *= arg0_shape[arg0_m_idx]; - if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + // check and calculate m for arg0 and out + size_t arg0_m_idx = 0; // first axe in arg0 for m + size_t out_m_idx = 0; // first axe in out for m + for (size_t i = 0; i < axes_for_m_count; i++) { - std::vector arg_vec{"arg0", "output"}; - std::vector shape_vec{arg0_shape, out_shape}; + m *= arg0_shape[arg0_m_idx]; + if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + { + std::vector arg_vec{"arg0", "output"}; + std::vector shape_vec{arg0_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - // check and calculate n for arg1 and out - size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n - size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n - for (size_t i = 0; i < axes_for_n_count; i++) - { - n *= arg1_shape[arg1_n_idx]; - if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + // check and calculate n for arg1 and out + size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n + size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n + for (size_t i = 0; i < 
axes_for_n_count; i++) { - std::vector arg_vec{"arg1", "output"}; - std::vector shape_vec{arg1_shape, out_shape}; + n *= arg1_shape[arg1_n_idx]; + if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + { + std::vector arg_vec{"arg1", "output"}; + std::vector shape_vec{arg1_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << " CUBLAS_OP_N," - << " CUBLAS_OP_N," - << " " << n << "," - << " " << m << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << n << "," - << " static_cast(input0)," - << " " << k << "," - << " &beta," - << " static_cast(output0)," - << " " << n << "));\n"; + lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << "));\n"; + // } + } else { NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."; } @@ -296,6 +376,8 @@ LanguageUnit_p cuda::Dot::emit_dependency() _lu->require(header::stdexcept); _lu->require(header::sstream); _lu->require(macro::CUBLAS_SAFE_CALL); + _lu->require(macro::CUDA_SAFE_CALL); + _lu->require(declaration::cuda_fp16_scale); //_lu->require(declaration::cublas_handle); return _lu; } diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp 
b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp index ed603d218..76af4bb3e 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp @@ -23,15 +23,15 @@ cuda::MaxPool1D::MaxPool1D(shared_ptr ctx) input_width = input_shape.back(); output_width = output_shape.back(); - input_type = ctx->inputs[0]->get_element_type().c_type_string(); - output_type = ctx->outputs[0]->get_element_type().c_type_string(); + input_type = ctx->inputs[0]->get_element_type(); + output_type = ctx->outputs[0]->get_element_type(); // NNFUSION_CHECK(input_shape.size() == 3) // << "Input shape size of MaxPool1D is invalid, shape size: " << input_shape.size() // << "expected 3"; std::stringstream tag; - tag << "cuda_maxpool_" << input_type << "_" << output_type << "_iw" + tag << "cuda_maxpool_" << input_type.c_type_string() << "_" << output_type.c_type_string() << "_iw" << std::to_string(input_width) << "_ow" << std::to_string(output_width) << "_ww" << std::to_string(window_width) << "_wst" << std::to_string(window_stride_width); custom_tag = tag.str(); @@ -53,11 +53,11 @@ LanguageUnit_p cuda::MaxPool1D::emit_function_body() // Index into input tensor. 
lu << "size_t start = (tid / " << output_width << ") * " << input_width << " + " << " (tid % " << output_width << ") * " << window_stride[0] << ";\n"; - lu << input_type << " max_val = " << TypeInfo::Get(input_type)->lowest() << ";\n"; + lu << input_type.c_type_string() << " max_val = " << TypeInfo::Get(input_type)->lowest() << ";\n"; lu << "for (size_t i = start; i < start + " << window_width << "; i++)\n"; lu.block_begin(); { - lu << "const " << input_type << " input = input0[i];\n"; + lu << "const " << input_type.c_type_string() << " input = input0[i];\n"; lu << "if (input > max_val)\n"; lu.block_begin(); { @@ -98,6 +98,8 @@ cuda::MaxPoolmD::MaxPoolmD(shared_ptr ctx) : CudaLibEmitter(ctx) { auto max_pool = static_pointer_cast(ctx->gnode->get_op_ptr()); + input_type = ctx->inputs[0]->get_element_type(); + output_type = ctx->outputs[0]->get_element_type(); input_shape = nnfusion::Shape(ctx->inputs[0]->get_shape()); output_shape = nnfusion::Shape(ctx->outputs[0]->get_shape()); window_shape = nnfusion::Shape(max_pool->get_window_shape()); @@ -105,11 +107,8 @@ cuda::MaxPoolmD::MaxPoolmD(shared_ptr ctx) padding_above = nnfusion::Shape(max_pool->get_padding_above()); window_stride = nnfusion::Strides(max_pool->get_window_movement_strides()); - input_type = ctx->inputs[0]->get_element_type().c_type_string(); - output_type = ctx->outputs[0]->get_element_type().c_type_string(); - std::stringstream tag; - tag << "cudnn_maxpool_dtype_" << output_type << "_i" << join(input_shape, "_") << "_o" + tag << "cudnn_maxpool_dtype_" << output_type.c_type_string() << "_i" << join(input_shape, "_") << "_o" << join(output_shape, "_") << "_ws" << join(window_shape, "_") << "_wst" << join(window_stride, "_") << "_pb" << join(padding_below, "_") << "_pb" << join(padding_above, "_"); @@ -124,8 +123,8 @@ LanguageUnit_p cuda::MaxPoolmD::emit_function_body() LanguageUnit_p _lu(new LanguageUnit(get_function_name())); auto& lu = *_lu; - auto input_desc = 
cudnn_tensor_descriptor_from_shape(input_shape, "input_desc"); - auto output_desc = cudnn_tensor_descriptor_from_shape(output_shape, "output_desc"); + auto input_desc = cudnn_tensor_descriptor_from_shape(input_shape, "input_desc", input_type); + auto output_desc = cudnn_tensor_descriptor_from_shape(output_shape, "output_desc", output_type); lu << input_desc->get_code(); lu << output_desc->get_code(); diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.hpp index 3a14bdf1d..1be7a15af 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.hpp @@ -24,8 +24,8 @@ namespace nnfusion shared_ptr kernel_ctx; nnfusion::Shape input_shape, output_shape, window_shape, padding_below, padding_above; + element::Type input_type, output_type; nnfusion::Strides window_stride; - string input_type, output_type; size_t window_width, window_stride_width, input_width, output_width; }; @@ -41,10 +41,10 @@ namespace nnfusion bool require_cudnn_handle() override { return true; } private: shared_ptr kernel_ctx; + element::Type input_type, output_type; nnfusion::Shape input_shape, output_shape, window_shape, padding_below, padding_above; nnfusion::Strides window_stride; - string input_type, output_type; }; } // namespace cuda } // namespace kernels diff --git a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp index 35b80d1cd..1779ad827 100644 --- a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp +++ b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp @@ -66,13 +66,13 @@ namespace nnfusion return result.str(); } - inline int get_type_id(nnfusion::element::Type type) - { - // TODO: fill more type cases - if (type == nnfusion::element::f32) - return DT_FLOAT; - throw std::runtime_error("Not supported element type."); - } + // inline int get_type_id(nnfusion::element::Type 
type) + // { + // // TODO: fill more type cases + // if (type == nnfusion::element::f32) + // return DT_FLOAT; + // throw std::runtime_error("Not supported element type."); + // } template inline std::shared_ptr get_op_object(std::shared_ptr& curr) diff --git a/src/nnfusion/engine/pass/graph/codegen_graphcore_pass.hpp b/src/nnfusion/engine/pass/graph/codegen_graphcore_pass.hpp index bd272c0bf..45a8b389d 100644 --- a/src/nnfusion/engine/pass/graph/codegen_graphcore_pass.hpp +++ b/src/nnfusion/engine/pass/graph/codegen_graphcore_pass.hpp @@ -66,13 +66,13 @@ namespace nnfusion return result.str(); } - inline int get_type_id(nnfusion::element::Type type) - { - // TODO: fill more type cases - if (type == nnfusion::element::f32) - return DT_FLOAT; - throw std::runtime_error("Not supported element type."); - } + // inline int get_type_id(nnfusion::element::Type type) + // { + // // TODO: fill more type cases + // if (type == nnfusion::element::f32) + // return DT_FLOAT; + // throw std::runtime_error("Not supported element type."); + // } template inline std::shared_ptr get_op_object(std::shared_ptr& curr) diff --git a/test/nnfusion/kernels/batch_test.cpp b/test/nnfusion/kernels/batch_test.cpp index 024bc4257..00bb1d983 100644 --- a/test/nnfusion/kernels/batch_test.cpp +++ b/test/nnfusion/kernels/batch_test.cpp @@ -66,7 +66,7 @@ namespace nnfusion ///\todo Maybe a better/general way template - bool check_kernels(NNFusion_DeviceType dev_t, DataType data_t) + bool check_kernels(NNFusion_DeviceType dev_t, element::Type data_t) { for (int case_id = 0;; case_id++) { @@ -92,255 +92,255 @@ namespace nnfusion ///param: node, device_type, data_type ... 
etc TEST(nnfusion_core_kernels, batch_kernel_tests_abs) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_add) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } /* TODO: arg type is bool, enable if bool data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_and) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ /* TODO: arg index type is i32/i64, enable if more data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_arg_max) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_arg_min) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ TEST(nnfusion_core_kernels, batch_kernel_tests_broadcast) { - 
EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_ceiling) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_concat) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } /* TODO: enable if more data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_convert) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ TEST(nnfusion_core_kernels, batch_kernel_tests_divide) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_dot) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + 
EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } /* TODO: return type is bool, enable if bool data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_equal) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ TEST(nnfusion_core_kernels, batch_kernel_tests_floor) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } /* TODO: return type is bool, enable if bool data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_greater) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_greater_eq) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_less) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } 
TEST(nnfusion_core_kernels, batch_kernel_tests_less_eq) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ TEST(nnfusion_core_kernels, batch_kernel_tests_max) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_max_pool) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_maximum) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_min) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_minimum) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + 
EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_multiply) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_negative) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } /* TODO: enable if more data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_not) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ /* TODO: return type is bool, enable if bool data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_not_equal) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ /* TODO: enable if bool data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_or) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + 
EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ TEST(nnfusion_core_kernels, batch_kernel_tests_pad) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_product) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_relu) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_relu_backprop) { // TODO: there is no cpu kernel implemented - // EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + // EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } /* TODO: there is no replace slice kernel implemented TEST(nnfusion_core_kernels, batch_kernel_tests_replace_slice) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ TEST(nnfusion_core_kernels, batch_kernel_tests_reshape) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, 
DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_reverse) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } /* TODO: enable if bool data type is supported, the test case data type should also be modified TEST(nnfusion_core_kernels, batch_kernel_tests_select) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } */ TEST(nnfusion_core_kernels, batch_kernel_tests_sign) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_slice) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_sqrt) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, 
element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_subtract) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } TEST(nnfusion_core_kernels, batch_kernel_tests_sum) { - EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, DT_FLOAT)); - EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, DT_FLOAT)); + EXPECT_TRUE(nnfusion::test::check_kernels(GENERIC_CPU, element::f32)); + EXPECT_TRUE(nnfusion::test::check_kernels(CUDA_GPU, element::f32)); } \ No newline at end of file diff --git a/thirdparty/ngraph/src/nnfusion/common/type_info.cpp b/thirdparty/ngraph/src/nnfusion/common/type_info.cpp index 12d730860..d52c14193 100644 --- a/thirdparty/ngraph/src/nnfusion/common/type_info.cpp +++ b/thirdparty/ngraph/src/nnfusion/common/type_info.cpp @@ -18,6 +18,7 @@ const nnfusion::TypeInfo::TypeDispatch nnfusion::TypeInfo::dispatcher{ {"char", std::make_shared>()}, + {"half", std::make_shared>()}, {"float", std::make_shared>()}, {"double", std::make_shared>()}, {"int8_t", std::make_shared>()}, diff --git a/thirdparty/ngraph/src/nnfusion/common/type_info.hpp b/thirdparty/ngraph/src/nnfusion/common/type_info.hpp index 7c4a1c079..ff3858751 100644 --- a/thirdparty/ngraph/src/nnfusion/common/type_info.hpp +++ b/thirdparty/ngraph/src/nnfusion/common/type_info.hpp @@ -72,18 +72,18 @@ namespace nnfusion std::string max() const override { return to_string(std::numeric_limits::max()); } }; - enum DataType - { - DT_FLOAT, - DT_DOUBLE, - DT_INT8, - DT_INT16, - DT_INT32, - DT_INT64, - DT_UINT8, - DT_UINT16, - DT_UINT32, - DT_UINT64, - DT_CHAR, - }; + // enum DataType + // { + // DT_FLOAT, + // DT_DOUBLE, + // DT_INT8, + // DT_INT16, + // DT_INT32, + // DT_INT64, + // DT_UINT8, + // DT_UINT16, + // DT_UINT32, + // DT_UINT64, + // DT_CHAR, + // }; } \ No 
newline at end of file From c91a70c682c3ed70ae7b63b443f9333a4c4311df Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 27 Nov 2020 09:28:11 +0000 Subject: [PATCH 08/32] code sytle applied --- .../core/kernels/cpu/eigen/concat.cpp | 2 +- .../core/kernels/cpu/eigen/convolution.cpp | 2 +- src/nnfusion/core/kernels/cpu/eigen/dot.cpp | 2 +- .../core/kernels/cpu/eigen/elementwise.cpp | 7 +- src/nnfusion/core/kernels/cpu/eigen/lstm.cpp | 2 +- .../core/kernels/cpu/eigen/max_pool.cpp | 2 +- src/nnfusion/core/kernels/cpu/eigen/pad.cpp | 4 +- .../core/kernels/cpu/eigen/reduce.cpp | 7 +- .../core/kernels/cpu/eigen/softmax.cpp | 4 +- .../core/kernels/cpu/general/anyop.cpp | 4 +- .../core/kernels/cpu/general/reshape.cpp | 7 +- src/nnfusion/core/kernels/cpu/mkl/dot.cpp | 2 +- .../core/kernels/cpu/mlas/avg_pool.cpp | 4 +- .../core/kernels/cpu/mlas/batch_matmul.cpp | 2 +- .../core/kernels/cpu/mlas/convolution.cpp | 4 +- src/nnfusion/core/kernels/cpu/mlas/dot.cpp | 2 +- .../core/kernels/cpu/mlas/max_pool.cpp | 4 +- .../kernels/cpu/reference/batch_matmul.cpp | 4 +- .../core/kernels/cpu/reference/constant.cpp | 4 +- .../core/kernels/cpu/reference/kernels.cpp | 186 +++++++++--------- .../core/kernels/cpu/reference/one_hot.cpp | 4 +- .../core/kernels/cpu/reference/reduce_all.cpp | 4 +- .../kernels/cpu/reference/stop_gradient.cpp | 4 +- .../core/kernels/cpu/reference/transpose.cpp | 4 +- .../core/kernels/cpu/reference/variable.cpp | 4 +- .../core/kernels/cpu/simd/elementwise.cpp | 7 +- .../kernels/cpu/simd/elementwise_fused.cpp | 2 +- .../core/kernels/cuda_gpu/cuda_cudnn.cpp | 7 +- .../core/kernels/cuda_gpu/cuda_cudnn.hpp | 6 +- .../core/kernels/cuda_gpu/cuda_langunit.cpp | 8 +- .../cuda_gpu/inl/generate_kernel_code-inl.hpp | 4 +- .../core/kernels/cuda_gpu/kernels/addn.cpp | 7 +- .../kernels/cuda_gpu/kernels/allreduce.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/anyop.cpp | 4 +- .../kernels/cuda_gpu/kernels/apply_adam.cpp | 7 +- .../kernels/apply_gradient_descent.cpp | 14 +- 
.../cuda_gpu/kernels/apply_momentum.cpp | 7 +- .../core/kernels/cuda_gpu/kernels/assign.cpp | 7 +- .../kernels/cuda_gpu/kernels/assign_sub.cpp | 7 +- .../kernels/cuda_gpu/kernels/avg_pool.cpp | 24 +-- .../kernels/cuda_gpu/kernels/avg_pool.hpp | 4 +- .../kernels/cuda_gpu/kernels/batch_matmul.cpp | 8 +- .../kernels/cuda_gpu/kernels/batch_norm.cpp | 23 ++- .../cuda_gpu/kernels/blockfusion_fused.cpp | 2 +- .../kernels/cuda_gpu/kernels/broadcast.cpp | 8 +- .../core/kernels/cuda_gpu/kernels/concat.cpp | 8 +- .../cuda_gpu/kernels/concat_offset.cpp | 4 +- .../kernels/cuda_gpu/kernels/constant.cpp | 4 +- .../kernels/cuda_gpu/kernels/convolution.cpp | 28 +-- .../cuda_gpu/kernels/depthwise_conv2d.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/dot.cpp | 155 ++++++++------- .../core/kernels/cuda_gpu/kernels/dropout.cpp | 8 +- .../cuda_gpu/kernels/dynamic_stitch.cpp | 4 +- .../kernels/cuda_gpu/kernels/elementwise.cpp | 2 +- .../cuda_gpu/kernels/elementwise_fused.cpp | 2 +- .../kernels/cuda_gpu/kernels/gather_1d.cpp | 8 +- .../kernels/cuda_gpu/kernels/gather_nd.cpp | 8 +- .../cuda_gpu/kernels/invert_permutation.cpp | 5 +- .../kernels/cuda_gpu/kernels/layer_norm.cpp | 4 +- .../kernels/cuda_gpu/kernels/max_pool.cpp | 19 +- .../core/kernels/cuda_gpu/kernels/one_hot.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/pad.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/range.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/reduce.cpp | 54 ++--- .../kernels/cuda_gpu/kernels/reduce_all.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/reshape.cpp | 16 +- .../core/kernels/cuda_gpu/kernels/result.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/reverse.cpp | 4 +- .../cuda_gpu/kernels/reverse_sequence.cpp | 8 +- .../kernels/rocm/batch_gemm_fixed.cpp | 4 +- .../cuda_gpu/kernels/rocm/broadcast_host.cpp | 4 +- .../kernels/rocm/broadcast_kernel.cpp | 4 +- .../cuda_gpu/kernels/rocm/convfwd_fixed.cpp | 4 +- .../cuda_gpu/kernels/rocm/convolution.cpp | 4 +- .../cuda_gpu/kernels/rocm/gemm_fixed.cpp | 4 +- 
.../cuda_gpu/kernels/rocm/reduce_sum.cpp | 2 +- .../kernels/cuda_gpu/kernels/rocm/softmax.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/scatter.cpp | 7 +- .../core/kernels/cuda_gpu/kernels/slice.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/softmax.cpp | 6 +- .../kernels/sparse_apply_momentum.cpp | 7 +- .../cuda_gpu/kernels/stop_gradient.cpp | 4 +- .../cuda_gpu/kernels/strided_slice_grad.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/tile.cpp | 11 +- .../kernels/cuda_gpu/kernels/transpose.cpp | 4 +- .../cuda_gpu/kernels/unsorted_segment_sum.cpp | 7 +- .../kernels/cuda_gpu/kernels/variable.cpp | 4 +- .../core/kernels/cuda_gpu/kernels/zeros.cpp | 4 +- .../core/kernels/kernel_registration.cpp | 2 +- .../core/operators/generic_op/generic_op.hpp | 2 +- .../engine/pass/codegen/cuda_codegen_pass.cpp | 4 +- .../batchnorm_inference_folding_pass.cpp | 4 +- .../engine/pass/graph/kernel_selection.cpp | 9 +- src/nnfusion/engine/profiler/profiler.cpp | 3 +- .../frontend/tensorflow_import/ops/const.cpp | 60 ++++-- test/nnfusion/engine/profiler.cpp | 3 +- test/nnfusion/kernels/sample.cpp | 3 +- 97 files changed, 523 insertions(+), 459 deletions(-) mode change 100755 => 100644 src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp diff --git a/src/nnfusion/core/kernels/cpu/eigen/concat.cpp b/src/nnfusion/core/kernels/cpu/eigen/concat.cpp index f7e3df997..435cc262a 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/concat.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/concat.cpp @@ -209,6 +209,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Concat", // op_name + "Concat", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::ConcatEigen) diff --git a/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp b/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp index 8bda08757..dec874ce2 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp +++ 
b/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp @@ -143,6 +143,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Convolution", // op_name + "Convolution", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::ConvolutionEigen) diff --git a/src/nnfusion/core/kernels/cpu/eigen/dot.cpp b/src/nnfusion/core/kernels/cpu/eigen/dot.cpp index 4a04d623b..9a1ae81e1 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/dot.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/dot.cpp @@ -89,6 +89,6 @@ LanguageUnit_p cpu::Dot::emit_dependency() } REGISTER_KERNEL_EMITTER( - "Dot", // op_name + "Dot", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::Dot) diff --git a/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp b/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp index bfc79215f..9418aeb5d 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp @@ -7,9 +7,10 @@ using namespace nnfusion; using namespace nnfusion::kernels; #define REGISTER_EW_KERNEL(OP_NAME) \ - REGISTER_KERNEL_EMITTER("" #OP_NAME "", \ - Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \ - cpu::ElementwiseEigen); + REGISTER_KERNEL_EMITTER( \ + "" #OP_NAME "", \ + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \ + cpu::ElementwiseEigen); REGISTER_EW_KERNEL(Abs) REGISTER_EW_KERNEL(Acos) diff --git a/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp b/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp index 8c3ed6046..2675a6cde 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp @@ -144,7 +144,7 @@ LanguageUnit_p cpu::LstmEigen::emit_dependency() } REGISTER_KERNEL_EMITTER( - "Lstm", // op_name + "Lstm", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::LstmEigen) 
diff --git a/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp b/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp index cc55948d7..cd1959a44 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp @@ -168,6 +168,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "MaxPool", // op_name + "MaxPool", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs cpu::MaxPoolEigen) diff --git a/src/nnfusion/core/kernels/cpu/eigen/pad.cpp b/src/nnfusion/core/kernels/cpu/eigen/pad.cpp index aa82543a8..3010aa488 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/pad.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/pad.cpp @@ -7,6 +7,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Pad", // op_name + "Pad", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs - cpu::Pad) // constructor + cpu::Pad) // constructor diff --git a/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp b/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp index 86065f66c..bd09ef09f 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp @@ -7,9 +7,10 @@ using namespace nnfusion; using namespace nnfusion::kernels; #define REGISTER_EW_KERNEL(OP_NAME) \ - REGISTER_KERNEL_EMITTER("" #OP_NAME "", \ - Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \ - cpu::ReduceEigen); + REGISTER_KERNEL_EMITTER( \ + "" #OP_NAME "", \ + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \ + cpu::ReduceEigen); //REGISTER_EW_KERNEL(Sum) //REGISTER_EW_KERNEL(Product) diff --git a/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp b/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp index cb7a99800..692b6904a 100644 --- a/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp +++ b/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp 
@@ -7,6 +7,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Softmax", // op_name + "Softmax", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs - cpu::SoftmaxEigen) // constructor + cpu::SoftmaxEigen) // constructor diff --git a/src/nnfusion/core/kernels/cpu/general/anyop.cpp b/src/nnfusion/core/kernels/cpu/general/anyop.cpp index de5620391..8bbd9489c 100644 --- a/src/nnfusion/core/kernels/cpu/general/anyop.cpp +++ b/src/nnfusion/core/kernels/cpu/general/anyop.cpp @@ -35,6 +35,6 @@ LanguageUnit_p cpu::AnyOP::emit_dependency() // Register Pad kernel emitter -REGISTER_KERNEL_EMITTER("AnyOP", //op_name +REGISTER_KERNEL_EMITTER("AnyOP", //op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Priority(2), //attrs - cpu::AnyOP) // constructor + cpu::AnyOP) // constructor diff --git a/src/nnfusion/core/kernels/cpu/general/reshape.cpp b/src/nnfusion/core/kernels/cpu/general/reshape.cpp index f8e694d00..9d54d418c 100644 --- a/src/nnfusion/core/kernels/cpu/general/reshape.cpp +++ b/src/nnfusion/core/kernels/cpu/general/reshape.cpp @@ -95,6 +95,7 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Reshape", //op_name - Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("cpu").Priority(2), //attrs - cpu::ReshapeMemcpy) //constructor +REGISTER_KERNEL_EMITTER( + "Reshape", //op_name + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("cpu").Priority(2), //attrs + cpu::ReshapeMemcpy) //constructor diff --git a/src/nnfusion/core/kernels/cpu/mkl/dot.cpp b/src/nnfusion/core/kernels/cpu/mkl/dot.cpp index eb4e77fa4..d5230fc83 100644 --- a/src/nnfusion/core/kernels/cpu/mkl/dot.cpp +++ b/src/nnfusion/core/kernels/cpu/mkl/dot.cpp @@ -177,6 +177,6 @@ LanguageUnit_p cpu::DotMkl::emit_dependency() } REGISTER_KERNEL_EMITTER( - "Dot", // op_name + "Dot", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mkl").Priority(3), // 
attrs cpu::DotMkl) diff --git a/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp b/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp index 68f8c484e..a8b9efac4 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp @@ -108,6 +108,6 @@ LanguageUnit_p cpu::AvgPoolMlas::emit_dependency() } REGISTER_KERNEL_EMITTER( - "AvgPool", // op_name + "AvgPool", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs - cpu::AvgPoolMlas) // constructor + cpu::AvgPoolMlas) // constructor diff --git a/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp b/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp index bcd21a959..3d07d1a15 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp @@ -114,6 +114,6 @@ LanguageUnit_p cpu::BatchMatMulMlas::emit_dependency() } REGISTER_KERNEL_EMITTER( - "BatchMatMul", // op_name + "BatchMatMul", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs cpu::BatchMatMulMlas) diff --git a/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp b/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp index 48f635f5a..699f6354f 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp @@ -184,6 +184,6 @@ LanguageUnit_p cpu::ConvolutionMlas::emit_dependency() } REGISTER_KERNEL_EMITTER( - "Convolution", // op_name + "Convolution", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs - cpu::ConvolutionMlas) // constructor + cpu::ConvolutionMlas) // constructor diff --git a/src/nnfusion/core/kernels/cpu/mlas/dot.cpp b/src/nnfusion/core/kernels/cpu/mlas/dot.cpp index 37ae88445..2dc3177a9 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/dot.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/dot.cpp @@ -111,6 +111,6 @@ LanguageUnit_p cpu::DotMlas::emit_dependency() } 
REGISTER_KERNEL_EMITTER( - "Dot", // op_name + "Dot", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs cpu::DotMlas) diff --git a/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp b/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp index 85e2bc94b..7738e3065 100644 --- a/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp +++ b/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp @@ -110,6 +110,6 @@ LanguageUnit_p cpu::MaxPoolMlas::emit_dependency() } REGISTER_KERNEL_EMITTER( - "MaxPool", // op_name + "MaxPool", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs - cpu::MaxPoolMlas) // constructor + cpu::MaxPoolMlas) // constructor diff --git a/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp b/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp index bbff5a1bd..ddbe1e076 100644 --- a/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp @@ -113,9 +113,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "BatchMatMul", // op_name + "BatchMatMul", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - BatchMatMulRef) // constructor + BatchMatMulRef) // constructor } // namespace cpu } // namespace kernels diff --git a/src/nnfusion/core/kernels/cpu/reference/constant.cpp b/src/nnfusion/core/kernels/cpu/reference/constant.cpp index e7094a52f..7917d10ad 100644 --- a/src/nnfusion/core/kernels/cpu/reference/constant.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/constant.cpp @@ -69,6 +69,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Constant", //op_name +REGISTER_KERNEL_EMITTER("Constant", //op_name Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs - cpu::Constant) // constructor \ No newline at end of file + cpu::Constant) // constructor \ No newline at end of file diff --git 
a/src/nnfusion/core/kernels/cpu/reference/kernels.cpp b/src/nnfusion/core/kernels/cpu/reference/kernels.cpp index 896fa385e..9b9afd156 100644 --- a/src/nnfusion/core/kernels/cpu/reference/kernels.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/kernels.cpp @@ -2136,9 +2136,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Abs", // op_name + "Abs", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - AbsRef) // constructor + AbsRef) // constructor class AcosRef : public KernelEmitter { @@ -2174,9 +2174,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Acos", // op_name + "Acos", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - AcosRef) // constructor + AcosRef) // constructor class AddRef : public KernelEmitter { @@ -2212,9 +2212,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Add", // op_name + "Add", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - AddRef) // constructor + AddRef) // constructor class AllReduceRef : public KernelEmitter { @@ -2251,9 +2251,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "AllReduce", // op_name + "AllReduce", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - AllReduceRef) // constructor + AllReduceRef) // constructor class AsinRef : public KernelEmitter { @@ -2289,9 +2289,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Asin", // op_name + "Asin", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - AsinRef) // constructor + AsinRef) // constructor class AtanRef : public KernelEmitter { @@ -2327,9 +2327,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Atan", // op_name + "Atan", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - AtanRef) // constructor + AtanRef) // constructor class BroadcastRef : public KernelEmitter { @@ -2367,9 +2367,9 @@ namespace nnfusion }; 
REGISTER_KERNEL_EMITTER( - "Broadcast", // op_name + "Broadcast", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - BroadcastRef) // constructor + BroadcastRef) // constructor class CeilingRef : public KernelEmitter { @@ -2405,9 +2405,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Ceiling", // op_name + "Ceiling", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - CeilingRef) // constructor + CeilingRef) // constructor class ConcatRef : public KernelEmitter { @@ -2452,9 +2452,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Concat", // op_name + "Concat", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - ConcatRef) // constructor + ConcatRef) // constructor /* class ConstantRef : public KernelEmitter @@ -2529,9 +2529,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Convert", // op_name + "Convert", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - ConvertRef) // constructor + ConvertRef) // constructor class ConvolutionRef : public KernelEmitter { @@ -2574,9 +2574,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Convolution", // op_name + "Convolution", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - ConvolutionRef) // constructor + ConvolutionRef) // constructor class CosRef : public KernelEmitter { @@ -2612,9 +2612,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Cos", // op_name + "Cos", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - CosRef) // constructor + CosRef) // constructor class CoshRef : public KernelEmitter { @@ -2650,9 +2650,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Cosh", // op_name + "Cosh", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - CoshRef) // constructor + CoshRef) // constructor class DivideRef : public KernelEmitter { @@ 
-2688,9 +2688,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Divide", // op_name + "Divide", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - DivideRef) // constructor + DivideRef) // constructor class EqualRef : public KernelEmitter { @@ -2726,9 +2726,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Equal", // op_name + "Equal", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - EqualRef) // constructor + EqualRef) // constructor class ExpRef : public KernelEmitter { @@ -2764,9 +2764,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Exp", // op_name + "Exp", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - ExpRef) // constructor + ExpRef) // constructor class FloorRef : public KernelEmitter { @@ -2802,9 +2802,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Floor", // op_name + "Floor", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - FloorRef) // constructor + FloorRef) // constructor class GreaterRef : public KernelEmitter { @@ -2840,9 +2840,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Greater", // op_name + "Greater", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - GreaterRef) // constructor + GreaterRef) // constructor class LessRef : public KernelEmitter { @@ -2878,9 +2878,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Less", // op_name + "Less", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - LessRef) // constructor + LessRef) // constructor class LogRef : public KernelEmitter { @@ -2916,9 +2916,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Log", // op_name + "Log", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - LogRef) // constructor + LogRef) // constructor class LRNRef : public KernelEmitter { @@ -2956,9 +2956,9 @@ namespace 
nnfusion }; REGISTER_KERNEL_EMITTER( - "LRN", // op_name + "LRN", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - LRNRef) // constructor + LRNRef) // constructor class MaxRef : public KernelEmitter { @@ -2996,9 +2996,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Max", // op_name + "Max", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - MaxRef) // constructor + MaxRef) // constructor class MaximumRef : public KernelEmitter { @@ -3034,9 +3034,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Maximum", // op_name + "Maximum", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - MaximumRef) // constructor + MaximumRef) // constructor class MinRef : public KernelEmitter { @@ -3074,9 +3074,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Min", // op_name + "Min", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - MinRef) // constructor + MinRef) // constructor class MinimumRef : public KernelEmitter { @@ -3112,9 +3112,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Minimum", // op_name + "Minimum", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - MinimumRef) // constructor + MinimumRef) // constructor class MultiplyRef : public KernelEmitter { @@ -3150,9 +3150,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Multiply", // op_name + "Multiply", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - MultiplyRef) // constructor + MultiplyRef) // constructor class NegativeRef : public KernelEmitter { @@ -3188,9 +3188,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Negative", // op_name + "Negative", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - NegativeRef) // constructor + NegativeRef) // constructor class PowerRef : public KernelEmitter { @@ -3226,9 +3226,9 @@ 
namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Power", // op_name + "Power", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - PowerRef) // constructor + PowerRef) // constructor class ProductRef : public KernelEmitter { @@ -3266,9 +3266,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Product", // op_name + "Product", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - ProductRef) // constructor + ProductRef) // constructor class ReluRef : public KernelEmitter { @@ -3304,9 +3304,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Relu", // op_name + "Relu", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - ReluRef) // constructor + ReluRef) // constructor class SelectRef : public KernelEmitter { @@ -3343,9 +3343,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Select", // op_name + "Select", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SelectRef) // constructor + SelectRef) // constructor class SigmoidRef : public KernelEmitter { @@ -3381,9 +3381,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Sigmoid", // op_name + "Sigmoid", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SigmoidRef) // constructor + SigmoidRef) // constructor class SignRef : public KernelEmitter { @@ -3419,9 +3419,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Sign", // op_name + "Sign", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SignRef) // constructor + SignRef) // constructor class SinRef : public KernelEmitter { @@ -3457,9 +3457,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Sin", // op_name + "Sin", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SinRef) // constructor + SinRef) // constructor class SinhRef : public KernelEmitter { @@ -3495,9 +3495,9 @@ namespace 
nnfusion }; REGISTER_KERNEL_EMITTER( - "Sinh", // op_name + "Sinh", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SinhRef) // constructor + SinhRef) // constructor class SliceRef : public KernelEmitter { @@ -3536,9 +3536,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Slice", // op_name + "Slice", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SliceRef) // constructor + SliceRef) // constructor class SoftmaxRef : public KernelEmitter { @@ -3580,9 +3580,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Softmax", // op_name + "Softmax", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SoftmaxRef) // constructor + SoftmaxRef) // constructor class SqrtRef : public KernelEmitter { @@ -3618,9 +3618,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Sqrt", // op_name + "Sqrt", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SqrtRef) // constructor + SqrtRef) // constructor class SubtractRef : public KernelEmitter { @@ -3656,9 +3656,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Subtract", // op_name + "Subtract", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SubtractRef) // constructor + SubtractRef) // constructor class SumRef : public KernelEmitter { @@ -3696,9 +3696,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Sum", // op_name + "Sum", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - SumRef) // constructor + SumRef) // constructor class TanRef : public KernelEmitter { @@ -3734,9 +3734,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Tan", // op_name + "Tan", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - TanRef) // constructor + TanRef) // constructor class TanhRef : public KernelEmitter { @@ -3772,9 +3772,9 @@ namespace nnfusion }; 
REGISTER_KERNEL_EMITTER( - "Tanh", // op_name + "Tanh", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - TanhRef) // constructor + TanhRef) // constructor class BatchNormRef : public KernelEmitter { @@ -3811,7 +3811,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "BatchNormInference", // op_name + "BatchNormInference", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs BatchNormRef) @@ -3855,7 +3855,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "AvgPool", // op_name + "AvgPool", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs AvgPoolRef) @@ -3896,7 +3896,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Dot", // op_name + "Dot", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs DotRef) @@ -3939,7 +3939,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "MaxPool", // op_name + "MaxPool", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs MaxPoolRef) @@ -3981,7 +3981,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Pad", // op_name + "Pad", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs PadRef) @@ -4021,7 +4021,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Reshape", // op_name + "Reshape", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ReshapeRef) @@ -4067,7 +4067,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Result", // op_name + "Result", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ResultRef) @@ -4105,7 +4105,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "LessEq", // op_name + "LessEq", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs LessEqRef) @@ -4145,7 +4145,7 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Reverse", // op_name + "Reverse", // op_name 
Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs ReverseRef) diff --git a/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp b/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp index ddaa06613..769d0b0cf 100644 --- a/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp @@ -68,9 +68,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "OneHot", // op_name + "OneHot", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - OneHotRef) // constructor + OneHotRef) // constructor } // namespace cpu } // namespace kernels diff --git a/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp b/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp index 20362bfad..2fea9d3c0 100644 --- a/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp @@ -61,9 +61,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "All", // op_name + "All", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - AllRef) // constructor + AllRef) // constructor } // namespace cpu } // namespace kernels diff --git a/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp b/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp index 531cea738..7678ef56b 100644 --- a/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp @@ -54,9 +54,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "StopGradient", // op_name + "StopGradient", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - StopGradientRef) // constructor + StopGradientRef) // constructor } // namespace cpu } // namespace kernels diff --git a/src/nnfusion/core/kernels/cpu/reference/transpose.cpp b/src/nnfusion/core/kernels/cpu/reference/transpose.cpp index 4b9508899..cd487a7c0 100644 --- 
a/src/nnfusion/core/kernels/cpu/reference/transpose.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/transpose.cpp @@ -113,9 +113,9 @@ namespace nnfusion }; REGISTER_KERNEL_EMITTER( - "Transpose", // op_name + "Transpose", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs - TransposeRef) // constructor + TransposeRef) // constructor } // namespace cpu } // namespace kernels diff --git a/src/nnfusion/core/kernels/cpu/reference/variable.cpp b/src/nnfusion/core/kernels/cpu/reference/variable.cpp index 2b8bf9c0c..5e16388f6 100644 --- a/src/nnfusion/core/kernels/cpu/reference/variable.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/variable.cpp @@ -67,6 +67,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Variable", //op_name +REGISTER_KERNEL_EMITTER("Variable", //op_name Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs - cpu::Variable) // constructor \ No newline at end of file + cpu::Variable) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp b/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp index cd725f542..05120d0b3 100644 --- a/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp +++ b/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp @@ -7,9 +7,10 @@ using namespace nnfusion; using namespace nnfusion::kernels; #define REGISTER_EW_KERNEL(OP_NAME) \ - REGISTER_KERNEL_EMITTER("" #OP_NAME "", \ - Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5), \ - cpu::ElementwiseSimd); + REGISTER_KERNEL_EMITTER( \ + "" #OP_NAME "", \ + Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5), \ + cpu::ElementwiseSimd); REGISTER_EW_KERNEL(Abs) REGISTER_EW_KERNEL(Ceiling) diff --git a/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp b/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp index 1423244bf..c94b31b06 100644 --- 
a/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp +++ b/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp @@ -438,6 +438,6 @@ LanguageUnit_p ElementwiseFused::emit_comments() } REGISTER_KERNEL_EMITTER( - "ElementwiseFused", // op_name + "ElementwiseFused", // op_name Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5), // attrs cpu::ElementwiseFused) diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp index f8891b025..548e1e4e0 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp @@ -31,7 +31,9 @@ std::string cuda::get_cudnn_datatype(element::Type dtype) return p->second; } -LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, element::Type type) +LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, + string desc, + element::Type type) { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; @@ -92,7 +94,8 @@ LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& s return _lu; } -LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type) +LanguageUnit_p + cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type) { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp index fc8cdd8dc..3c5a1e013 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp @@ -16,13 +16,15 @@ namespace nnfusion std::string get_cudnn_datatype(element::Type type); LanguageUnit_p cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, - element::Type type = element::f32); + element::Type type); LanguageUnit_p get_cudnn_convolution_descriptor(const Shape& 
padding, const Strides& window_movement_strides, const Strides& window_dilation_strides, string desc, element::Type type = element::f32); - LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type = element::f32); + LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, + string desc, + element::Type type = element::f32); LanguageUnit_p get_dropout_global_states(float ratio); inline std::string ratio2str(float ratio) { diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp index 565c18c49..ac8b9e90a 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp @@ -253,9 +253,8 @@ __device__ __forceinline__ int64_t load(const int64_t* __restrict__ in, int i= } )"); -LU_DEFINE( - declaration::cuda_fp16_scale, - R"( +LU_DEFINE(declaration::cuda_fp16_scale, + R"( __global__ void nnfusionHalfScaleKernel(half *x, half *alpha, size_t count) { size_t offset = threadIdx.x + blockIdx.x * blockDim.x; @@ -270,8 +269,7 @@ void nnfusionHalfScale(half *x, half *alpha, size_t len) { nnfusionHalfScaleKernel<<<(len+255)/256, 256>>>(x, alpha, len); } - )" -) + )") LU_DEFINE_EXTEND(declaration::cuda_reduce_primitive, R"( diff --git a/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp b/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp index 9ef0d123c..7d9c2131d 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp @@ -64,6 +64,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER(__KernelOpType__, // op_name +REGISTER_KERNEL_EMITTER(__KernelOpType__, // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs - cuda::__KernelUniqueClassName__) // constructor + cuda::__KernelUniqueClassName__) // 
constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp index b19c6d2ae..9972df1af 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp @@ -89,6 +89,7 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("AddN", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::AddN) +REGISTER_KERNEL_EMITTER( + "AddN", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::AddN) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp index 525968ab2..383c9b600 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp @@ -49,6 +49,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("AllReduce", //op_name +REGISTER_KERNEL_EMITTER("AllReduce", //op_name Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::SuperScalerAllReduce) // constructor + cuda::SuperScalerAllReduce) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp index 1adc0952c..2bf5882e7 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp @@ -34,6 +34,6 @@ LanguageUnit_p cuda::AnyOP::emit_dependency() // Register Pad kernel emitter -REGISTER_KERNEL_EMITTER("AnyOP", //op_name +REGISTER_KERNEL_EMITTER("AnyOP", //op_name Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::AnyOP) // constructor + cuda::AnyOP) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp index 
566e1b535..e42e0eda9 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp @@ -105,6 +105,7 @@ if(i == 0) using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("ApplyAdam", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::ApplyAdam) \ No newline at end of file +REGISTER_KERNEL_EMITTER( + "ApplyAdam", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::ApplyAdam) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp index 05dcc1087..908502b41 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp @@ -75,9 +75,11 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("ApplyGradient", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::ApplyGradientDescent) -REGISTER_KERNEL_EMITTER("ApplyGradientDescent", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::ApplyGradientDescent) +REGISTER_KERNEL_EMITTER( + "ApplyGradient", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::ApplyGradientDescent) +REGISTER_KERNEL_EMITTER( + "ApplyGradientDescent", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::ApplyGradientDescent) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp index 72716bd4e..a506653f1 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp @@ -79,6 +79,7 @@ namespace nnfusion 
using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("ApplyMomentum", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::ApplyMomentum) +REGISTER_KERNEL_EMITTER( + "ApplyMomentum", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::ApplyMomentum) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp index f22ba886b..914b2f7cb 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp @@ -75,6 +75,7 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Assign", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::Assign) +REGISTER_KERNEL_EMITTER( + "Assign", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::Assign) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp index d21de903d..4e380a74f 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp @@ -75,6 +75,7 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("AssignSub", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::AssignSub) +REGISTER_KERNEL_EMITTER( + "AssignSub", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::AssignSub) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp index bc2c56b1b..479156586 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp @@ -50,8 +50,8 @@ 
cuda::AvgPool1D::AvgPool1D(shared_ptr ctx) padding_above = nnfusion::Shape(avg_pool->get_padding_above()); window_stride = nnfusion::Strides(avg_pool->get_window_movement_strides()); include_pad = avg_pool->get_include_padding_in_avg_computation(); - input_type = ctx->inputs[0]->get_element_type().c_type_string(); - output_type = ctx->outputs[0]->get_element_type().c_type_string(); + input_type = ctx->inputs[0]->get_element_type(); + output_type = ctx->outputs[0]->get_element_type(); // NNFUSION_CHECK(input_shape.size() == 3) // << "Input shape size of AvgPool1D is invalid, shape size: " << input_shape.size() @@ -265,16 +265,16 @@ cuda::AvgPoolmD::AvgPoolmD(shared_ptr ctx) padding_above = nnfusion::Shape(avg_pool->get_padding_above()); window_stride = nnfusion::Strides(avg_pool->get_window_movement_strides()); include_pad = avg_pool->get_include_padding_in_avg_computation(); - input_type = ctx->inputs[0]->get_element_type().c_type_string(); - output_type = ctx->outputs[0]->get_element_type().c_type_string(); + input_type = ctx->inputs[0]->get_element_type(); + output_type = ctx->outputs[0]->get_element_type(); NNFUSION_CHECK(input_shape.size() == 4 || input_shape.size() == 5) << "Input shape size of AvgPoolmD is invalid, shape size: " << input_shape.size() << "expected 4 or 5"; std::stringstream tag; - tag << "cudnn_avgpool_dtype_" << output_type << "_i" << join(input_shape, "_") << "_o" - << join(output_shape, "_") << "_ws" << join(window_stride, "_") << "_wst" + tag << "cudnn_avgpool_dtype_" << output_type.c_type_string() << "_i" << join(input_shape, "_") + << "_o" << join(output_shape, "_") << "_ws" << join(window_stride, "_") << "_wst" << join(window_stride, "_") << "_pb" << join(padding_below, "_") << "_pb" << join(padding_above, "_"); custom_tag = tag.str(); @@ -288,8 +288,8 @@ LanguageUnit_p cuda::AvgPoolmD::emit_function_body() auto cudnn_avg_type = include_pad ? 
"CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING" : "CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING"; - auto input_desc = cudnn_tensor_descriptor_from_shape(input_shape, "input_desc"); - auto output_desc = cudnn_tensor_descriptor_from_shape(output_shape, "output_desc"); + auto input_desc = cudnn_tensor_descriptor_from_shape(input_shape, "input_desc", input_type); + auto output_desc = cudnn_tensor_descriptor_from_shape(output_shape, "output_desc", output_type); lu << input_desc->get_code(); lu << output_desc->get_code(); @@ -404,11 +404,11 @@ LanguageUnit_p cuda::AvgPoolmD::emit_function_signature() } REGISTER_KERNEL_EMITTER( - "AvgPool", // op_name + "AvgPool", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::AvgPool1D) // constructor + cuda::AvgPool1D) // constructor REGISTER_KERNEL_EMITTER( - "AvgPool", // op_name + "AvgPool", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs - cuda::AvgPoolmD) // constructor + cuda::AvgPoolmD) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.hpp index da1a55717..68c45e233 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.hpp @@ -54,7 +54,7 @@ namespace nnfusion padding_above; nnfusion::Strides window_stride; bool include_pad; - string input_type, output_type; + element::Type input_type, output_type; // Precompute for fast constant memory access. 
int HW, DHW, CDHW, PQ, MPQ, KMPQ, RS, TRS; @@ -78,7 +78,7 @@ namespace nnfusion padding_above; nnfusion::Strides window_stride; bool include_pad; - string input_type, output_type; + element::Type input_type, output_type; }; } // namespace cuda } // namespace kernels diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp index ad81ec3d8..173e95e93 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp @@ -192,11 +192,11 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "BatchMatMul", // op_name + "BatchMatMul", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::BatchMatMul) // constructor + cuda::BatchMatMul) // constructor REGISTER_KERNEL_EMITTER( - "BatchMatMul", // op_name + "BatchMatMul", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::BatchMatMul) // constructor + cuda::BatchMatMul) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp index 6a64bee78..050a27a58 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp @@ -28,7 +28,7 @@ LanguageUnit_p cuda::BatchNorm::emit_function_body() { LanguageUnit_p _lu(new LanguageUnit(get_function_name())); auto& lu = *_lu; - auto tensor_desc = cudnn_tensor_descriptor_from_shape(tensor_shape, "tensor_desc"); + auto tensor_desc = cudnn_tensor_descriptor_from_shape(tensor_shape, "tensor_desc", dtype); lu << tensor_desc->get_code(); // derived_param_desc lu << "cudnnTensorDescriptor_t derived_param_desc;\n"; @@ -201,12 +201,15 @@ void cuda::BatchNormNCHW::set_launch_config() using namespace nnfusion; using namespace nnfusion::kernels; 
-REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn").Priority(2), // attrs - cuda::BatchNorm) // constructor -REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs - cuda::BatchNormNCHW) // constructor -REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name - Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs - cuda::BatchNormNCHW) // constructor +REGISTER_KERNEL_EMITTER( + "BatchNormInference", // op_name + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn").Priority(2), // attrs + cuda::BatchNorm) // constructor +REGISTER_KERNEL_EMITTER( + "BatchNormInference", // op_name + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs + cuda::BatchNormNCHW) // constructor +REGISTER_KERNEL_EMITTER( + "BatchNormInference", // op_name + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs + cuda::BatchNormNCHW) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp index 22cea1999..877b84477 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp @@ -71,6 +71,6 @@ void BlockFusionFused::set_launch_config() } REGISTER_KERNEL_EMITTER( - "BlockFusionFused", // op_name + "BlockFusionFused", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::BlockFusionFused) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp index 08660ed41..f839422c7 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp @@ -194,10 +194,10 @@ 
namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Broadcast", //op_name +REGISTER_KERNEL_EMITTER("Broadcast", //op_name Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::Broadcast) // constructor + cuda::Broadcast) // constructor -REGISTER_KERNEL_EMITTER("Broadcast", //op_name +REGISTER_KERNEL_EMITTER("Broadcast", //op_name Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::RocmBroadcast) // constructor + cuda::RocmBroadcast) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp index 3b39408e0..6d7791f61 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp @@ -324,9 +324,9 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Concat", //op_name +REGISTER_KERNEL_EMITTER("Concat", //op_name Device(CUDA_GPU).TypeConstraint(element::f32), //attrs - cuda::Concat) // constructor + cuda::Concat) // constructor namespace nnfusion { @@ -479,6 +479,6 @@ namespace nnfusion } // namespace kernels } // namespace nnfusion -REGISTER_KERNEL_EMITTER("Concat", //op_name +REGISTER_KERNEL_EMITTER("Concat", //op_name Device(CUDA_GPU).TypeConstraint(element::f32), //attrs - cuda::ConcatKernel) // constructor + cuda::ConcatKernel) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp index 586091381..2c812666b 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp @@ -73,6 +73,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "ConcatOffset", // op_name + "ConcatOffset", // op_name 
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::ConcatOffset) // constructor + cuda::ConcatOffset) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp index d6e0d9000..035b00bb7 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp @@ -119,6 +119,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Constant", //op_name +REGISTER_KERNEL_EMITTER("Constant", //op_name Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::Constant) // constructor \ No newline at end of file + cuda::Constant) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp index 7ddcb0426..6b91e3956 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp @@ -15,8 +15,10 @@ cuda::ConvolutionCudnn::ConvolutionCudnn(shared_ptr ctx) input_type = ctx->inputs[0]->get_element_type(); filter_type = ctx->inputs[1]->get_element_type(); output_type = ctx->outputs[0]->get_element_type(); - NNFUSION_CHECK(input_type == filter_type && input_type == output_type) - << "Convolution input datatype (" << input_type << ") should be the same with that of filter (" << filter_type << "), and that of output (" << output_type << ")."; + NNFUSION_CHECK(input_type == filter_type && input_type == output_type) + << "Convolution input datatype (" << input_type + << ") should be the same with that of filter (" << filter_type << "), and that of output (" + << output_type << ")."; conv_type = input_type; input_shape = ctx->inputs[0]->get_shape(); filter_shape = ctx->inputs[1]->get_shape(); @@ -85,14 +87,18 @@ LanguageUnit_p 
cuda::ConvolutionCudnn::emit_function_body() padding_below[i] = static_cast(padding_below_diff[i]); } - { // lu << "cudnnDataType_t data_type = " << get_cudnn_datatype(dtype) << ";\n"; - lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type)->get_code(); - lu << cudnn_tensor_descriptor_from_shape(output_shape, "tensor_desc_1", output_type)->get_code(); + lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type) + ->get_code(); + lu << cudnn_tensor_descriptor_from_shape(output_shape, "tensor_desc_1", output_type) + ->get_code(); lu << get_cudnn_filter_descriptor(filter_shape, "filter_desc", filter_type)->get_code(); - lu << get_cudnn_convolution_descriptor( - padding_below, window_movement_strides, window_dilation_strides, "conv_desc", conv_type) + lu << get_cudnn_convolution_descriptor(padding_below, + window_movement_strides, + window_dilation_strides, + "conv_desc", + conv_type) ->get_code(); lu << R"( @@ -213,11 +219,11 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_signature() } REGISTER_KERNEL_EMITTER( - "Convolution", // op_name + "Convolution", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs - cuda::ConvolutionCudnn) // constructor + cuda::ConvolutionCudnn) // constructor REGISTER_KERNEL_EMITTER( - "Convolution", // op_name + "Convolution", // op_name Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cudnn_kernel").Priority(2), // attrs - cuda::ConvolutionCudnn) // constructor + cuda::ConvolutionCudnn) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp index 2b1dce365..d4ccbead1 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp @@ -372,6 +372,6 @@ LanguageUnit_p cuda::DepthwiseConv2dNative::emit_dependency() } REGISTER_KERNEL_EMITTER( - 
"DepthwiseConv2dNative", // op_name + "DepthwiseConv2dNative", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::DepthwiseConv2dNative) // constructor + cuda::DepthwiseConv2dNative) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 2f07207c3..7a9bef553 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -203,7 +203,9 @@ LanguageUnit_p cuda::Dot::emit_function_body() << " static_cast(output0)," << " " << n << "));\n"; } - } else if (dtype == element::f16) { + } + else if (dtype == element::f16) + { // case 1: Scalar * Tensor // if (arg0_shape.empty() || arg1_shape.empty()) // { @@ -282,86 +284,87 @@ LanguageUnit_p cuda::Dot::emit_function_body() // << " static_cast(output0)," // << " " << m << "));\n"; // } else { - size_t axes_for_m_count = arg0_shape.size() - reduction_axes; - size_t axes_for_n_count = arg1_shape.size() - reduction_axes; - size_t axes_for_k_count = reduction_axes; - size_t m = 1; - size_t n = 1; - size_t k = 1; - - // check if input and output size correct - // check and calculate k for arg0 and arg1 - size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k - size_t arg1_k_idx = 0; // first axe in arg1 for k - - for (size_t i = 0; i < axes_for_k_count; i++) + size_t axes_for_m_count = arg0_shape.size() - reduction_axes; + size_t axes_for_n_count = arg1_shape.size() - reduction_axes; + size_t axes_for_k_count = reduction_axes; + size_t m = 1; + size_t n = 1; + size_t k = 1; + + // check if input and output size correct + // check and calculate k for arg0 and arg1 + size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k + size_t arg1_k_idx = 0; // first axe in arg1 for k + + for (size_t i = 0; i < axes_for_k_count; i++) + { + k *= arg0_shape[arg0_k_idx]; + if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) { - k 
*= arg0_shape[arg0_k_idx]; - if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) - { - std::vector arg_vec{"arg0", "arg1"}; - std::vector shape_vec{arg0_shape, arg1_shape}; + std::vector arg_vec{"arg0", "arg1"}; + std::vector shape_vec{arg0_shape, arg1_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; } - // check and calculate m for arg0 and out - size_t arg0_m_idx = 0; // first axe in arg0 for m - size_t out_m_idx = 0; // first axe in out for m - for (size_t i = 0; i < axes_for_m_count; i++) + } + // check and calculate m for arg0 and out + size_t arg0_m_idx = 0; // first axe in arg0 for m + size_t out_m_idx = 0; // first axe in out for m + for (size_t i = 0; i < axes_for_m_count; i++) + { + m *= arg0_shape[arg0_m_idx]; + if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) { - m *= arg0_shape[arg0_m_idx]; - if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) - { - std::vector arg_vec{"arg0", "output"}; - std::vector shape_vec{arg0_shape, out_shape}; + std::vector arg_vec{"arg0", "output"}; + std::vector shape_vec{arg0_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; } - // check and calculate n for arg1 and out - size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n - size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n - for (size_t i = 0; 
i < axes_for_n_count; i++) + } + // check and calculate n for arg1 and out + size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n + size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n + for (size_t i = 0; i < axes_for_n_count; i++) + { + n *= arg1_shape[arg1_n_idx]; + if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) { - n *= arg1_shape[arg1_n_idx]; - if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) - { - std::vector arg_vec{"arg1", "output"}; - std::vector shape_vec{arg1_shape, out_shape}; + std::vector arg_vec{"arg1", "output"}; + std::vector shape_vec{arg1_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; } + } - lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << " CUBLAS_OP_N," - << " CUBLAS_OP_N," - << " " << n << "," - << " " << m << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << n << "," - << " static_cast(input0)," - << " " << k << "," - << " &beta," - << " static_cast(output0)," - << " " << n << "));\n"; + lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << "));\n"; // } - - } else { + } + else + { NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."; } //lu.block_end(); @@ -419,16 +422,16 @@ 
LanguageUnit_p cuda::Dot::emit_function_signature() } REGISTER_KERNEL_EMITTER( - "Dot", // op_name + "Dot", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor + cuda::Dot) // constructor REGISTER_KERNEL_EMITTER( - "Dot", // op_name + "Dot", // op_name Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor + cuda::Dot) // constructor REGISTER_KERNEL_EMITTER( - "Dot", // op_name + "Dot", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor + cuda::Dot) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp index 25b4c51ce..96a5e976e 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp @@ -253,10 +253,10 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("DropoutTraining", // op_name +REGISTER_KERNEL_EMITTER("DropoutTraining", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn"), // attrs - cuda::DropoutTraining) // constructor + cuda::DropoutTraining) // constructor -REGISTER_KERNEL_EMITTER("DropoutTrainingGrad", // op_name +REGISTER_KERNEL_EMITTER("DropoutTrainingGrad", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn"), // attrs - cuda::DropoutTrainingGrad) // constructor + cuda::DropoutTrainingGrad) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp index 96385a2f2..4bd847949 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp @@ -122,6 +122,6 @@ LanguageUnit_p cuda::DynamicStitch::emit_dependency() } REGISTER_KERNEL_EMITTER( - 
"DynamicStitch", // op_name + "DynamicStitch", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::DynamicStitch) // constructor \ No newline at end of file + cuda::DynamicStitch) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp index 9597d122e..4f5b2b6cc 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp @@ -9,7 +9,7 @@ using namespace nnfusion::kernels; #define REGISTER_EW_KERNEL(OP_NAME) \ REGISTER_KERNEL_EMITTER( \ "" #OP_NAME "", \ - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("element_wise").Priority(2), \ + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("element_wise").Priority(2), \ cuda::ElementWise); REGISTER_EW_KERNEL(Abs) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp index c4c3b0bdc..b29e0fa16 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp @@ -340,6 +340,6 @@ void ElementWiseFused::compute_best_config(int& grids, int& blocks, int& bound) } REGISTER_KERNEL_EMITTER( - "ElementWiseFused", // op_name + "ElementWiseFused", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs cuda::ElementWiseFused) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp index 94fa0e506..8b1d4836c 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp @@ -115,9 +115,9 @@ LanguageUnit_p cuda::Gather1D::emit_dependency() } REGISTER_KERNEL_EMITTER( - "GatherV2", // op_name + "GatherV2", // 
op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Gather1D) // constructor + cuda::Gather1D) // constructor cuda::Gather1DGrad::Gather1DGrad(shared_ptr ctx) : BlockCudaEmitter(ctx) @@ -228,6 +228,6 @@ LanguageUnit_p cuda::Gather1DGrad::emit_dependency() } REGISTER_KERNEL_EMITTER( - "GatherGrad", // op_name + "GatherGrad", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Gather1DGrad) // constructor + cuda::Gather1DGrad) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp index 6743691cf..4b36dba4f 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp @@ -278,10 +278,10 @@ atomic_add(output0 + x_offset, __ldg(input1 + y_offset)); using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("GatherND", // op_name +REGISTER_KERNEL_EMITTER("GatherND", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs - cuda::GatherND) // constructor + cuda::GatherND) // constructor -REGISTER_KERNEL_EMITTER("GatherNDGrad", // op_name +REGISTER_KERNEL_EMITTER("GatherNDGrad", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs - cuda::GatherNDGrad) // constructor + cuda::GatherNDGrad) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp index 6dd9bba2e..e20dd2492 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp @@ -65,6 +65,7 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("InvertPermutation", - Device(CUDA_GPU).TypeConstraint(element::f32).Priority( - 
2), // TODO: this op input and output will all be int + Device(CUDA_GPU) + .TypeConstraint(element::f32) + .Priority(2), // TODO: this op input and output will all be int cuda::InvertPermutation) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp index bc9a6f463..5e39b3b65 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp @@ -77,6 +77,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("LayerNorm", // op_name +REGISTER_KERNEL_EMITTER("LayerNorm", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudalib"), // attrs - cuda::LayerNorm) // constructor + cuda::LayerNorm) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp index 76af4bb3e..9bce41fe7 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp @@ -31,8 +31,8 @@ cuda::MaxPool1D::MaxPool1D(shared_ptr ctx) // << "expected 3"; std::stringstream tag; - tag << "cuda_maxpool_" << input_type.c_type_string() << "_" << output_type.c_type_string() << "_iw" - << std::to_string(input_width) << "_ow" << std::to_string(output_width) << "_ww" + tag << "cuda_maxpool_" << input_type.c_type_string() << "_" << output_type.c_type_string() + << "_iw" << std::to_string(input_width) << "_ow" << std::to_string(output_width) << "_ww" << std::to_string(window_width) << "_wst" << std::to_string(window_stride_width); custom_tag = tag.str(); } @@ -53,7 +53,8 @@ LanguageUnit_p cuda::MaxPool1D::emit_function_body() // Index into input tensor. 
lu << "size_t start = (tid / " << output_width << ") * " << input_width << " + " << " (tid % " << output_width << ") * " << window_stride[0] << ";\n"; - lu << input_type.c_type_string() << " max_val = " << TypeInfo::Get(input_type)->lowest() << ";\n"; + lu << input_type.c_type_string() << " max_val = " << TypeInfo::Get(input_type)->lowest() + << ";\n"; lu << "for (size_t i = start; i < start + " << window_width << "; i++)\n"; lu.block_begin(); { @@ -108,8 +109,8 @@ cuda::MaxPoolmD::MaxPoolmD(shared_ptr ctx) window_stride = nnfusion::Strides(max_pool->get_window_movement_strides()); std::stringstream tag; - tag << "cudnn_maxpool_dtype_" << output_type.c_type_string() << "_i" << join(input_shape, "_") << "_o" - << join(output_shape, "_") << "_ws" << join(window_shape, "_") << "_wst" + tag << "cudnn_maxpool_dtype_" << output_type.c_type_string() << "_i" << join(input_shape, "_") + << "_o" << join(output_shape, "_") << "_ws" << join(window_shape, "_") << "_wst" << join(window_stride, "_") << "_pb" << join(padding_below, "_") << "_pb" << join(padding_above, "_"); custom_tag = tag.str(); @@ -239,11 +240,11 @@ LanguageUnit_p cuda::MaxPoolmD::emit_function_signature() } REGISTER_KERNEL_EMITTER( - "MaxPool", // op_name + "MaxPool", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::MaxPool1D) // constructor + cuda::MaxPool1D) // constructor REGISTER_KERNEL_EMITTER( - "MaxPool", // op_name + "MaxPool", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs - cuda::MaxPoolmD) // constructor + cuda::MaxPoolmD) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp index 1368e1244..2cccc00f2 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp @@ -109,6 +109,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; 
REGISTER_KERNEL_EMITTER( - "OneHot", // op_name + "OneHot", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::OneHot) // constructor + cuda::OneHot) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp index 0ac7149dc..faab94fe9 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp @@ -147,6 +147,6 @@ KernelRegistrar kernel_registrar0( */ REGISTER_KERNEL_EMITTER( - "Pad", // op_name + "Pad", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Pad) // constructor \ No newline at end of file + cuda::Pad) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp index 397eeb9b4..1c5a30279 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp @@ -63,6 +63,6 @@ LanguageUnit_p cuda::Range::emit_dependency() return _lu; } REGISTER_KERNEL_EMITTER( - "Range", // op_name + "Range", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Range) // constructor \ No newline at end of file + cuda::Range) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp index 15dd2d3ce..835754c46 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp @@ -6,56 +6,62 @@ using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Max", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::Reduce) +REGISTER_KERNEL_EMITTER( + "Max", + 
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::Reduce) REGISTER_KERNEL_EMITTER( - "Max", // op_name + "Max", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) -REGISTER_KERNEL_EMITTER("Min", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::Reduce) +REGISTER_KERNEL_EMITTER( + "Min", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::Reduce) REGISTER_KERNEL_EMITTER( - "Min", // op_name + "Min", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) -REGISTER_KERNEL_EMITTER("Product", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::Reduce) +REGISTER_KERNEL_EMITTER( + "Product", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::Reduce) REGISTER_KERNEL_EMITTER( - "Product", // op_name + "Product", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) -REGISTER_KERNEL_EMITTER("Sum", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::Reduce) +REGISTER_KERNEL_EMITTER( + "Sum", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::Reduce) REGISTER_KERNEL_EMITTER( - "Sum", // op_name + "Sum", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) -REGISTER_KERNEL_EMITTER("Sum", - Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::Reduce) +REGISTER_KERNEL_EMITTER( + "Sum", + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::Reduce) REGISTER_KERNEL_EMITTER( - "Sum", // op_name + "Sum", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) 
-REGISTER_KERNEL_EMITTER("ReduceAny", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::Reduce) +REGISTER_KERNEL_EMITTER( + "ReduceAny", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::Reduce) REGISTER_KERNEL_EMITTER( - "ReduceAny", // op_name + "ReduceAny", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs cuda::ReduceMemcpy) \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp index 6760325ec..1298e6fbf 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp @@ -109,6 +109,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "All", // op_name + "All", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::All) // constructor + cuda::All) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp index 168a91011..aa9fc49e4 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp @@ -557,21 +557,21 @@ LanguageUnit_p cuda::ReshapeMemcpy::emit_function_signature() // Register Reshape kernel emitter REGISTER_KERNEL_EMITTER( - "Reshape", // op_name + "Reshape", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_2D").Priority(2), // attrs - cuda::Reshape2D) // constructor + cuda::Reshape2D) // constructor REGISTER_KERNEL_EMITTER( - "Reshape", // op_name + "Reshape", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_3D").Priority(2), // attrs - cuda::Reshape3D) // constructor + cuda::Reshape3D) // constructor REGISTER_KERNEL_EMITTER( - "Reshape", // op_name + "Reshape", // op_name 
Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_D").Priority(2), // attrs - cuda::ReshapehD) // constructor + cuda::ReshapehD) // constructor REGISTER_KERNEL_EMITTER( - "Reshape", // op_name + "Reshape", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs - cuda::ReshapeMemcpy) // constructor + cuda::ReshapeMemcpy) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp index 46e81ade7..f3429c956 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp @@ -84,6 +84,6 @@ LanguageUnit_p cuda::Result::emit_dependency() } REGISTER_KERNEL_EMITTER( - "Result", // op_name + "Result", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs - cuda::Result) // constructor \ No newline at end of file + cuda::Result) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp index 36f2d39b5..6d5fc374d 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp @@ -101,6 +101,6 @@ LanguageUnit_p cuda::Reverse::emit_dependency() } REGISTER_KERNEL_EMITTER( - "Reverse", // op_name + "Reverse", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Reverse) // constructor \ No newline at end of file + cuda::Reverse) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp index 6f0e0e2bc..487951930 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp @@ -124,10 +124,10 @@ if (tid < @threads@) { 
using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "ReverseSequence", // op_name + "ReverseSequence", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::ReverseSequence) // constructor + cuda::ReverseSequence) // constructor -REGISTER_KERNEL_EMITTER("ReverseSequence", // op_name +REGISTER_KERNEL_EMITTER("ReverseSequence", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), // attrs - cuda::RocmReverseSequence) // constructor \ No newline at end of file + cuda::RocmReverseSequence) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp index 5ed2d8ea4..de4ec59e9 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp @@ -112,6 +112,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("BatchMatMul", // op_name +REGISTER_KERNEL_EMITTER("BatchMatMul", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), // attrs - cuda::BatchGemmFixed) // constructor + cuda::BatchGemmFixed) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp index 63dd091e4..f8658791d 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp @@ -172,6 +172,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Broadcast", //op_name +REGISTER_KERNEL_EMITTER("Broadcast", //op_name Device(ROCM_GPU).TypeConstraint(element::f32).Priority(3), //attrs - cuda::RocmBiasBroadcast) // constructor + cuda::RocmBiasBroadcast) // constructor 
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp index f5f8b3a50..7c2318f9c 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp @@ -295,6 +295,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Broadcast", // op_name +REGISTER_KERNEL_EMITTER("Broadcast", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), // attrs - cuda::RocmManualBroadcast) // constructor + cuda::RocmManualBroadcast) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp index 04fb041b8..01bf11715 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp @@ -141,6 +141,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Convolution", // op_name + "Convolution", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::ConvFwdFixed) // constructor + cuda::ConvFwdFixed) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp index 2b19b8db0..6abefd6e6 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp @@ -226,6 +226,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Convolution", // op_name + "Convolution", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs - cuda::RocmConvolutionCudnn) // constructor + cuda::RocmConvolutionCudnn) // constructor 
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp index 983b7fce3..1fd205a99 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp @@ -207,6 +207,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Dot", // op_name + "Dot", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::GemmFixed) // constructor + cuda::GemmFixed) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp index a476ddd06..d12253ba7 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp @@ -335,7 +335,7 @@ using namespace nnfusion::kernels; #define REGISTER_GPU_KERNEL(KEY, OP_NAME) \ REGISTER_KERNEL_EMITTER(KEY, \ - Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), \ + Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), \ cuda::RocmReduce) REGISTER_GPU_KERNEL("Sum", Add) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp index a2a54782d..c10db2e19 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp @@ -134,6 +134,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Softmax", // op_name + "Softmax", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::RocmSoftmax) // constructor + cuda::RocmSoftmax) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp index fb2fd5930..b88350fda 
100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp @@ -97,9 +97,10 @@ using namespace nnfusion; using namespace nnfusion::kernels; #define REGISTER_SCATTER_KERNEL(OP_NAME, KERNEL_NAME) \ - REGISTER_KERNEL_EMITTER("" #KERNEL_NAME "", \ - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("scatter").Priority(2), \ - cuda::Scatter); + REGISTER_KERNEL_EMITTER( \ + "" #KERNEL_NAME "", \ + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("scatter").Priority(2), \ + cuda::Scatter); REGISTER_SCATTER_KERNEL(Subtract, ScatterSub); REGISTER_SCATTER_KERNEL(Add, ScatterAdd); diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp index d5bb5910d..766255b2c 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp @@ -138,6 +138,6 @@ LanguageUnit_p cuda::Slice::emit_dependency() } REGISTER_KERNEL_EMITTER( - "Slice", // op_name + "Slice", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Slice) // constructor + cuda::Slice) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp index 0dba0aec4..c653abedc 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp @@ -132,9 +132,9 @@ LanguageUnit_p cuda::Softmax::emit_function_signature() } REGISTER_KERNEL_EMITTER( - "Softmax", // op_name + "Softmax", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs - cuda::Softmax) // constructor + cuda::Softmax) // constructor cuda::SoftmaxGrad::SoftmaxGrad(shared_ptr ctx) : CudaLibEmitter(ctx) @@ -266,6 +266,6 @@ LanguageUnit_p cuda::SoftmaxGrad::emit_function_signature() } REGISTER_KERNEL_EMITTER( - "SoftmaxGrad", // op_name + 
"SoftmaxGrad", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs cuda::SoftmaxGrad) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp index 063716257..bbfac36eb 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp @@ -126,6 +126,7 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("SparseApplyMomentum", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::SparseApplyMomentum) +REGISTER_KERNEL_EMITTER( + "SparseApplyMomentum", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::SparseApplyMomentum) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp index 6f7192da6..00cfe33fa 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp @@ -115,6 +115,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "StopGradient", // op_name + "StopGradient", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::StopGradient) // constructor + cuda::StopGradient) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp index d796ef822..342edf949 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp @@ -113,6 +113,6 @@ LanguageUnit_p cuda::StridedSliceGrad::emit_dependency() return _lu; } REGISTER_KERNEL_EMITTER( - "StridedSliceGrad", // op_name + 
"StridedSliceGrad", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::StridedSliceGrad) // constructor \ No newline at end of file + cuda::StridedSliceGrad) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp index 4b22d7c25..33a869e71 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp @@ -146,10 +146,11 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Tile", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::Tile) +REGISTER_KERNEL_EMITTER( + "Tile", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::Tile) -REGISTER_KERNEL_EMITTER("Tile", //op_name +REGISTER_KERNEL_EMITTER("Tile", //op_name Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::RocmTile) // constructor \ No newline at end of file + cuda::RocmTile) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp index 7016e0518..824a799cf 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp @@ -143,6 +143,6 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( - "Transpose", // op_name + "Transpose", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Transpose) // constructor + cuda::Transpose) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp index e816cd7b9..576136cd9 100644 --- 
a/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp @@ -185,6 +185,7 @@ atomicAdd(output0 + output_index, input0[tid]); using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("UnsortedSegmentSum", - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::UnsortedSegmentSum) +REGISTER_KERNEL_EMITTER( + "UnsortedSegmentSum", + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), + cuda::UnsortedSegmentSum) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp index 368e24241..419124649 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp @@ -77,6 +77,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Variable", //op_name +REGISTER_KERNEL_EMITTER("Variable", //op_name Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::Variable) // constructor \ No newline at end of file + cuda::Variable) // constructor \ No newline at end of file diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp index 0a457b435..0ebc875a0 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp @@ -58,6 +58,6 @@ namespace nnfusion using namespace nnfusion; using namespace nnfusion::kernels; -REGISTER_KERNEL_EMITTER("Zeros", // op_name +REGISTER_KERNEL_EMITTER("Zeros", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs - cuda::Zeros) // constructor + cuda::Zeros) // constructor diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp index f18d75b7d..f527906fe 100644 
--- a/src/nnfusion/core/kernels/kernel_registration.cpp +++ b/src/nnfusion/core/kernels/kernel_registration.cpp @@ -2,8 +2,8 @@ // Licensed under the MIT License. #include "kernel_registration.hpp" -#include "nnfusion/util/util.hpp" #include "ngraph/src/nnfusion/common/type/element_type.hpp" +#include "nnfusion/util/util.hpp" using namespace nnfusion; using namespace nnfusion::kernels; diff --git a/src/nnfusion/core/operators/generic_op/generic_op.hpp b/src/nnfusion/core/operators/generic_op/generic_op.hpp index 2a9f36171..f61552569 100644 --- a/src/nnfusion/core/operators/generic_op/generic_op.hpp +++ b/src/nnfusion/core/operators/generic_op/generic_op.hpp @@ -6,8 +6,8 @@ #include #include -#include "nnfusion/common/common.hpp" #include "ngraph/src/nnfusion/common/type/element_type.hpp" +#include "nnfusion/common/common.hpp" #define REGISTER_OP(op_x) \ static nnfusion::op::OpConfig __register_op_##op_x = nnfusion::op::build_op_config(#op_x) diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp old mode 100755 new mode 100644 index fd1c3b456..0aa467fd2 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -803,8 +803,8 @@ void CudaCodegenPass::create_header_file(std::shared_ptr ctx lu_header << declaration::typedef_int->get_code() << "\n"; if (device_type() == CUDA_GPU || device_type() == ROCM_GPU) lu_header << header::cuda->get_code(); - // TODO only include this if half is used - lu_header << header::cuda_fp16->get_code(); + // TODO only include this if half is used + lu_header << header::cuda_fp16->get_code(); lu_header << "extern \"C\" int kernel_entry("; std::string params = get_kernel_entry_paras(tu); diff --git a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp index 1dc3b5521..a0dfaf000 100644 --- 
a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp +++ b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp @@ -887,7 +887,7 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptrget_name(); + << graph->get_name(); for (auto pattern : BN_FOLDING_PATTERNS) { BatchNormInferenceOptimizer optimizer(graph, pattern); @@ -899,7 +899,7 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptrget_name(); + << graph->get_name(); } return true; } \ No newline at end of file diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp b/src/nnfusion/engine/pass/graph/kernel_selection.cpp index 7b212ee2f..7216cacc1 100644 --- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp @@ -26,7 +26,8 @@ pair IProfilingRuntime::Pointer runtime) { std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32); + KernelRegistry::Global()->FindKernelRegistrations( + gnode->get_op_type(), devtype, element::f32); // Skip since only one candidate or constant if (kernel_regs.size() == 1 || gnode->is_constant()) @@ -143,7 +144,8 @@ pair { shared_ptr ctx(new KernelContext(gnode)); std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32); + KernelRegistry::Global()->FindKernelRegistrations( + gnode->get_op_type(), devtype, element::f32); if (devtype == ROCM_GPU) { @@ -355,7 +357,8 @@ pair NNFusion_DeviceType devtype) { std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32); + KernelRegistry::Global()->FindKernelRegistrations( + gnode->get_op_type(), devtype, element::f32); shared_ptr ctx(new KernelContext(gnode)); std::vector functions; diff --git a/src/nnfusion/engine/profiler/profiler.cpp b/src/nnfusion/engine/profiler/profiler.cpp index 174c3c108..0f4efbfab 100644 --- 
a/src/nnfusion/engine/profiler/profiler.cpp +++ b/src/nnfusion/engine/profiler/profiler.cpp @@ -82,7 +82,8 @@ void GraphEvaluate::create_profiling_contexts(shared_ptr gnode) return; } std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), dev_type, element::f32); + KernelRegistry::Global()->FindKernelRegistrations( + gnode->get_op_type(), dev_type, element::f32); shared_ptr ctx(new KernelContext(gnode)); for (auto kernel_reg : kernel_regs) diff --git a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp index 1d8f5e926..670303535 100644 --- a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp +++ b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp @@ -167,42 +167,62 @@ namespace nnfusion // int_val, float_val, etc. if (tensor_content_size == 0) { - -#define GET_VALUES(type) do { \ - const void* dat = nullptr; \ - for (size_t i = 0; i < n_elements; ++i) { \ - if (tensor.type##_val_size() == 1) { \ - dat = reinterpret_cast(&tensor.type##_val()[0]); \ - } else { \ - dat = reinterpret_cast(&tensor.type##_val()[i]); \ - } \ - values->setElement(i, dat); \ - } \ - } while(0) +#define GET_VALUES(type) \ + do \ + { \ + const void* dat = nullptr; \ + for (size_t i = 0; i < n_elements; ++i) \ + { \ + if (tensor.type##_val_size() == 1) \ + { \ + dat = reinterpret_cast(&tensor.type##_val()[0]); \ + } \ + else \ + { \ + dat = reinterpret_cast(&tensor.type##_val()[i]); \ + } \ + values->setElement(i, dat); \ + } \ + } while (0) values->resize(n_elements); auto& tensor = node.attr().at("value").tensor(); size_t val_size; - if (dt == tensorflow::DT_INT32) { + if (dt == tensorflow::DT_INT32) + { GET_VALUES(int); - } else if (dt == tensorflow::DT_INT64) { + } + else if (dt == tensorflow::DT_INT64) + { GET_VALUES(int64); - } else if (dt == tensorflow::DT_BOOL) { + } + else if (dt == tensorflow::DT_BOOL) + { GET_VALUES(bool); - } else if (dt == tensorflow::DT_HALF) { + } + 
else if (dt == tensorflow::DT_HALF) + { GET_VALUES(half); - } else if (dt == tensorflow::DT_FLOAT) { + } + else if (dt == tensorflow::DT_FLOAT) + { GET_VALUES(float); - } else if (dt == tensorflow::DT_DOUBLE) { + } + else if (dt == tensorflow::DT_DOUBLE) + { GET_VALUES(double); - } else if (dt == tensorflow::DT_STRING) { + } + else if (dt == tensorflow::DT_STRING) + { values->resize(tensor.string_val()[0].length()); auto it = tensor.string_val()[0].begin(); for (size_t j = 0; it != tensor.string_val()[0].end(); ++j, ++it) { values->setElement(j, reinterpret_cast(&it)); } - } else { + } + else + { return false; } diff --git a/test/nnfusion/engine/profiler.cpp b/test/nnfusion/engine/profiler.cpp index cffc74d1c..6709d1fdc 100644 --- a/test/nnfusion/engine/profiler.cpp +++ b/test/nnfusion/engine/profiler.cpp @@ -25,7 +25,8 @@ TEST(nnfusion_engine_profiler, basic_utils) // Filter out the kernels meeting the requirement; std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, element::f32); + KernelRegistry::Global()->FindKernelRegistrations( + gnode->get_op_type(), CUDA_GPU, element::f32); shared_ptr ctx(new KernelContext(gnode)); // Gnerate Test data diff --git a/test/nnfusion/kernels/sample.cpp b/test/nnfusion/kernels/sample.cpp index 367e6173c..e3cb109bf 100644 --- a/test/nnfusion/kernels/sample.cpp +++ b/test/nnfusion/kernels/sample.cpp @@ -24,7 +24,8 @@ TEST(nnfusion_core_kernels, sample) // Filter out the kernels meeting the requirement; std::vector> kernel_regs = - KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, element::f32); + KernelRegistry::Global()->FindKernelRegistrations( + gnode->get_op_type(), CUDA_GPU, element::f32); shared_ptr ctx(new KernelContext(gnode)); EXPECT_GT(kernel_regs.size(), 0); From 1ede97278beb38cfec60cf591522525ced9ea333 Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 2 Dec 2020 13:46:02 +0800 Subject: [PATCH 09/32] meet master --- 
.../engine/pass/graph/kernel_selection.cpp | 44 ------------------- 1 file changed, 44 deletions(-) diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp b/src/nnfusion/engine/pass/graph/kernel_selection.cpp index fa17819d5..cfe2992fd 100644 --- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp @@ -419,49 +419,5 @@ bool FetchBasedSelector::run_on_graph(std::shared_ptr& g } } - return true; -} - -bool DefaultKernelSelector::register_antares_kernel() -{ - for (auto pair : nnfusion::op::get_op_configs()) - { - std::string op_name = pair.first; - std::vector devs{CUDA_GPU, GENERIC_CPU, HLSL}; - - KernelRegistrar kernel_registrar_cuda( - op_name, - Name(op_name) - .Device(CUDA_GPU) - .TypeConstraint(element::f32) - .Tag("antares") - .Priority(9) - .KernelFactory([](shared_ptr context) -> shared_ptr { - return make_shared(context); - }) - .Build()); - KernelRegistrar kernel_registrar_cpu( - op_name, - Name(op_name) - .Device(GENERIC_CPU) - .TypeConstraint(element::f32) - .Tag("antares") - .Priority(9) - .KernelFactory([](shared_ptr context) -> shared_ptr { - return make_shared(context); - }) - .Build()); - KernelRegistrar kernel_registrar_hlsl( - op_name, - Name(op_name) - .Device(HLSL) - .TypeConstraint(element::f32) - .Tag("antares") - .Priority(9) - .KernelFactory([](shared_ptr context) -> shared_ptr { - return make_shared(context); - }) - .Build()); - } return true; } \ No newline at end of file From a3d43f0740a1ec3c1f88f0de1930ea088ed9aa71 Mon Sep 17 00:00:00 2001 From: Niupple Date: Thu, 3 Dec 2020 19:48:53 +0800 Subject: [PATCH 10/32] fp16 runnable --- maint/script/build.sh | 2 +- src/nnfusion/core/kernels/common_langunit.cpp | 2 +- src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp | 7 +++++++ src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp | 1 + .../operators/generic_op/generic_op_define/Convolution.cpp | 3 ++- .../generic_op/generic_op_define/DepthToSpace.cpp | 2 +- 
.../generic_op/generic_op_define/DepthwiseConv2dNative.cpp | 3 ++- src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 1 + src/nnfusion/engine/pass/graph/kernel_tuning.cpp | 6 +++--- .../frontend/tensorflow_import/util/graph_convert.cpp | 3 +++ .../ngraph/src/nnfusion/core/operators/op_define/fused.cpp | 2 +- 11 files changed, 23 insertions(+), 9 deletions(-) diff --git a/maint/script/build.sh b/maint/script/build.sh index 7d113d7c9..657ed780b 100755 --- a/maint/script/build.sh +++ b/maint/script/build.sh @@ -36,7 +36,7 @@ fi # Make pushd $THIS_SCRIPT_DIR/../../build/ > /dev/null -make -j6 +make -j$(nproc) popd > /dev/null if [ $? -ne 0 ]; then diff --git a/src/nnfusion/core/kernels/common_langunit.cpp b/src/nnfusion/core/kernels/common_langunit.cpp index 0c8ed9bb0..cdf6ec226 100644 --- a/src/nnfusion/core/kernels/common_langunit.cpp +++ b/src/nnfusion/core/kernels/common_langunit.cpp @@ -24,7 +24,7 @@ LU_DEFINE(header::limits, "#include \n"); // Macro LU_DEFINE(macro::NNFUSION_DEBUG, "#define NNFUSION_DEBUG\n"); -LU_DEFINE(macro::MIN, "#define MIN(a,b) ((a)>(b)?(b):(a))\n") +LU_DEFINE(macro::MIN, "#define MIN(a,b) ((a)>(b)?(b):(a))\n"); // Declaration LU_DEFINE(declaration::typedef_int, diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp index ac8b9e90a..18334faac 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp @@ -17,6 +17,13 @@ LU_DEFINE(header::cuda_prof_api, "#include \n"); LU_DEFINE(header::cuda_fp16, "#include \n"); // Macro +LU_DEFINE(macro::HALF_MAX, + R"(#ifndef __HALF_COMPARE_EX__ +#define __HALF_COMPARE_EX__ +inline __device__ half max(half x, half y) { return x > y ? x : y; } +inline __device__ half min(half x, half y) { return x < y ? 
x : y; } +#endif)"); + LU_DEFINE( macro::CUDA_SAFE_CALL_NO_THROW, R"(#define CUDA_SAFE_CALL_NO_THROW(x) \ diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp index 93dbc0243..ae68e3e72 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp @@ -21,6 +21,7 @@ namespace nnfusion namespace macro { + LU_DECLARE(HALF_MAX); LU_DECLARE(CUDA_SAFE_CALL_NO_THROW); LU_DECLARE(CUDA_SAFE_CALL); LU_DECLARE(CUDNN_SAFE_CALL_NO_THROW); diff --git a/src/nnfusion/core/operators/generic_op/generic_op_define/Convolution.cpp b/src/nnfusion/core/operators/generic_op/generic_op_define/Convolution.cpp index 34c7f1f71..cd37a1696 100644 --- a/src/nnfusion/core/operators/generic_op/generic_op_define/Convolution.cpp +++ b/src/nnfusion/core/operators/generic_op/generic_op_define/Convolution.cpp @@ -57,7 +57,8 @@ REGISTER_OP(Convolution) { auto pad_template = ".when([-@pad_0@ + HO + KH >= 0, -@pad_0@ + HO + KH < @height@, -@pad_1@ + WO + KW " - ">= 0, -@pad_1@ + WO + KW < @width@], 0.0)"; + ">= 0, -@pad_1@ + WO + KW < @width@], " + "const(0.0).cast(@input0@@input0_layout@.dtype()))"; pad_cond = op::create_code_from_template(pad_template, config); } config["pad_cond"] = pad_cond; diff --git a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthToSpace.cpp b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthToSpace.cpp index 0de6b0d83..727ffe888 100644 --- a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthToSpace.cpp +++ b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthToSpace.cpp @@ -48,7 +48,7 @@ REGISTER_OP(DepthToSpace) }) .translate_v2([](std::shared_ptr curr) -> std::string { auto expression_template = - R"( temp0@mediate0_layout@ = @input0@@input0_layout@ @cond0@; temp1@mediate1_layout@ = temp0@mediate0_layout@; @output0@@output0_layout@ = temp1@mediate1o_layout@ @cond1@; ## @: plan/advance_fusion )"; + 
R"( temp0@mediate0_layout@ = @input0@@input0_layout@ @cond0@; temp1@mediate1_layout@ = temp0@mediate0_layout@; @output0@@output0_layout@ = temp1@mediate1o_layout@ @cond1@;)"; auto input_shape = curr->get_input_shape(0); auto _op = std::dynamic_pointer_cast(curr->get_op_ptr()); diff --git a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp index 0d680d385..45690effc 100644 --- a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp +++ b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp @@ -103,7 +103,8 @@ REGISTER_OP(DepthwiseConv2dNative) { auto pad_template = ".when([-@pad_0@ + HO + KH >= 0, -@pad_0@ + HO + KH < @height@, -@pad_1@ + WO + KW " - ">= 0, -@pad_1@ + WO + KW < @width@], 0.0)"; + ">= 0, -@pad_1@ + WO + KW < @width@], " + "const(0.0).cast(@input0@@input0_layout@.dtype()))"; pad_cond = op::create_code_from_template(pad_template, config); } config["pad_cond"] = pad_cond; diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp index 9dbc3fc84..ce2397562 100644 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -161,6 +161,7 @@ void CudaCodegenPass::initialize(std::shared_ptr ctx, projgen->lup_codegen->require(macro::CUDA_SAFE_CALL); projgen->lup_codegen->require(macro::CUDNN_SAFE_CALL); projgen->lup_codegen->require(macro::CUBLAS_SAFE_CALL); + projgen->lup_codegen->require(macro::HALF_MAX); return; } diff --git a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp index 00d3456ca..4a083ccc0 100644 --- a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp @@ -187,7 +187,7 @@ bool KernelTuning::register_antares_kernel() op_name, 
kernels::Name(op_name) .Device(CUDA_GPU) - .TypeConstraint(DT_FLOAT) + .TypeConstraint(element::f32) .Tag("antares") .Priority(9) .KernelFactory([](shared_ptr context) @@ -199,7 +199,7 @@ bool KernelTuning::register_antares_kernel() op_name, kernels::Name(op_name) .Device(GENERIC_CPU) - .TypeConstraint(DT_FLOAT) + .TypeConstraint(element::f32) .Tag("antares") .Priority(9) .KernelFactory([](shared_ptr context) @@ -211,7 +211,7 @@ bool KernelTuning::register_antares_kernel() op_name, kernels::Name(op_name) .Device(HLSL) - .TypeConstraint(DT_FLOAT) + .TypeConstraint(element::f32) .Tag("antares") .Priority(9) .KernelFactory([](shared_ptr context) diff --git a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp index 257aba94c..17d3a12ee 100644 --- a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp +++ b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp @@ -59,6 +59,9 @@ namespace nnfusion config[entry.first] = "float32"; break; case ::tensorflow::DataType::DT_INT32: config[entry.first] = "int32"; break; + case ::tensorflow::DataType::DT_HALF: + config[entry.first] = "float16"; + break; default: NNFUSION_CHECK(false) << "Unrecognized data type: " << dtype; } } diff --git a/thirdparty/ngraph/src/nnfusion/core/operators/op_define/fused.cpp b/thirdparty/ngraph/src/nnfusion/core/operators/op_define/fused.cpp index a4425f60f..b0962174b 100644 --- a/thirdparty/ngraph/src/nnfusion/core/operators/op_define/fused.cpp +++ b/thirdparty/ngraph/src/nnfusion/core/operators/op_define/fused.cpp @@ -175,5 +175,5 @@ void Fused::register_ir2(std::vector>& gnodes) NNFUSION_LOG(INFO) << fused_op_ir2; // plan_rule = "## @: " + plan_rule; - plan_rule = "## @: plan/advance_fusion"; + plan_rule = ""; } From 7d1216008515881b2f311b9f2b646b7fbbbb4313 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 4 Dec 2020 14:19:15 +0800 Subject: [PATCH 11/32] fix macro newline --- 
src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp index 18334faac..ea429d047 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp @@ -22,7 +22,8 @@ LU_DEFINE(macro::HALF_MAX, #define __HALF_COMPARE_EX__ inline __device__ half max(half x, half y) { return x > y ? x : y; } inline __device__ half min(half x, half y) { return x < y ? x : y; } -#endif)"); +#endif +)"); LU_DEFINE( macro::CUDA_SAFE_CALL_NO_THROW, From f9209967feb535bc1628c403bf8ae3eddd1b1464 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 4 Dec 2020 15:11:15 +0800 Subject: [PATCH 12/32] check device type --- src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp index ce2397562..3875f582b 100644 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -805,7 +805,8 @@ void CudaCodegenPass::create_header_file(std::shared_ptr ctx if (device_type() == CUDA_GPU || device_type() == ROCM_GPU) lu_header << header::cuda->get_code(); // TODO only include this if half is used - lu_header << header::cuda_fp16->get_code(); + if (device_type() == CUDA_GPU) + lu_header << header::cuda_fp16->get_code(); lu_header << "extern \"C\" int kernel_entry("; std::string params = get_kernel_entry_paras(tu); From b71e6837de14c58406c626e08657fd9207074d3d Mon Sep 17 00:00:00 2001 From: Niupple Date: Mon, 7 Dec 2020 16:39:17 +0800 Subject: [PATCH 13/32] code stype --- src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp 
b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp index 3875f582b..25f5586a1 100644 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -805,7 +805,7 @@ void CudaCodegenPass::create_header_file(std::shared_ptr ctx if (device_type() == CUDA_GPU || device_type() == ROCM_GPU) lu_header << header::cuda->get_code(); // TODO only include this if half is used - if (device_type() == CUDA_GPU) + if (device_type() == CUDA_GPU) lu_header << header::cuda_fp16->get_code(); lu_header << "extern \"C\" int kernel_entry("; From 76fe4fd76aaacb2541dffdb301b6f50cef072ca4 Mon Sep 17 00:00:00 2001 From: Niupple Date: Tue, 8 Dec 2020 15:26:34 +0800 Subject: [PATCH 14/32] fix ROCm unsupported LU's --- src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp | 2 +- src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 7a9bef553..acd1939b6 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -380,7 +380,7 @@ LanguageUnit_p cuda::Dot::emit_dependency() _lu->require(header::sstream); _lu->require(macro::CUBLAS_SAFE_CALL); _lu->require(macro::CUDA_SAFE_CALL); - _lu->require(declaration::cuda_fp16_scale); + // _lu->require(declaration::cuda_fp16_scale); //_lu->require(declaration::cublas_handle); return _lu; } diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp index 25f5586a1..dc535d584 100644 --- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp +++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp @@ -842,7 +842,7 @@ void CudaCodegenPass::create_main_file(std::shared_ptr ctx, re_main->require(header::limits); re_main->require(header::cuda_prof_api); - re_main->require(header::cuda_fp16); 
+ // re_main->require(header::cuda_fp16); re_main->require(macro::CUDA_SAFE_CALL); lu_main << "#include \"nnfusion_rt.h\"\n"; From a7bf41e4c6bb00bd2fe43e35d0347c14b4ffcf58 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 11 Dec 2020 14:02:15 +0800 Subject: [PATCH 15/32] fix DataBuffer --- .../ngraph/src/nnfusion/core/operators/op_define/constant.hpp | 3 ++- .../src/nnfusion/frontend/onnx_import/util/graph_convert.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/thirdparty/ngraph/src/nnfusion/core/operators/op_define/constant.hpp b/thirdparty/ngraph/src/nnfusion/core/operators/op_define/constant.hpp index b6c2b0ee5..055be047a 100644 --- a/thirdparty/ngraph/src/nnfusion/core/operators/op_define/constant.hpp +++ b/thirdparty/ngraph/src/nnfusion/core/operators/op_define/constant.hpp @@ -109,8 +109,9 @@ namespace nnfusion << nnfusion::shape_size(m_shape) << ")."; DataBuffer buf(element_type); + size_t shape_size = nnfusion::shape_size(m_shape); - buf.loadFromStrings(values); + buf.loadFromStrings(values, shape_size); buf.dump(m_data); } diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp index f6022d220..7dbb5a2c4 100644 --- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp +++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp @@ -149,7 +149,7 @@ namespace nnfusion onnx::ModelProto proto_without_init; proto_without_init.CopyFrom(model_proto); proto_without_init.mutable_graph()->mutable_initializer()->Clear(); - NNFUSION_LOG(INFO) << proto_without_init.DebugString(); + // NNFUSION_LOG(INFO) << proto_without_init.DebugString(); } std::string From 1e42180f75a4ba979ef4b4878db1f1afe470dce6 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 11 Dec 2020 17:10:16 +0800 Subject: [PATCH 16/32] onnx changed to DataBuffer style import --- .../frontend/onnx_import/core/tensor.hpp | 50 ++---- 
.../frontend/onnx_import/util/util.cpp | 144 ++++++++++++++---- .../frontend/onnx_import/util/util.hpp | 5 +- 3 files changed, 133 insertions(+), 66 deletions(-) diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/core/tensor.hpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/core/tensor.hpp index 415abc080..85a624b14 100644 --- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/core/tensor.hpp +++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/core/tensor.hpp @@ -22,6 +22,7 @@ #pragma once #include "../util/util.hpp" +#include "ngraph/src/nnfusion/common/type/data_buffer.hpp" namespace nnfusion { @@ -55,50 +56,31 @@ namespace nnfusion return detail::get_data(*m_tensor_proto); } + DataBuffer buffer_get_data() const + { + return detail::buffer_get_data(*m_tensor_proto); + } + const std::string& get_name() const { NNFUSION_CHECK(m_tensor_proto->has_name()) << "tensor has no name specified."; return m_tensor_proto->name(); } - const element::Type& get_ng_type() const + element::Type get_ng_type() const { NNFUSION_CHECK(m_tensor_proto->has_data_type()) << "tensor has no data type specified."; - switch (m_tensor_proto->data_type()) - { - case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: - return element::boolean; - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: - return element::f32; - case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: - return element::f64; - case onnx::TensorProto_DataType::TensorProto_DataType_INT8: return element::i8; - case onnx::TensorProto_DataType::TensorProto_DataType_INT16: - return element::i16; - case onnx::TensorProto_DataType::TensorProto_DataType_INT32: - return element::i32; - case onnx::TensorProto_DataType::TensorProto_DataType_INT64: - return element::i64; - case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: return element::u8; - case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: - 
return element::u16; - case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: - return element::u32; - case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: - return element::u64; - case onnx::TensorProto_DataType::TensorProto_DataType_UNDEFINED: - NNFUSION_CHECK_FAIL() << "data type is not defined"; - break; - default: - NNFUSION_CHECK_FAIL() - << "unsupported data type: " - << onnx::TensorProto_DataType_Name( - onnx::TensorProto_DataType(m_tensor_proto->data_type())); - break; - } + element::Type element_type; + bool status; + status = ONNXDataTypeToNNFusionElementType( + static_cast(m_tensor_proto->data_type()), + &element_type); + NNFUSION_CHECK(status) << "Data type not supported: " + << m_tensor_proto->data_type(); + + return element_type; } operator onnx::TensorProto_DataType() const diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.cpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.cpp index d6f52653d..28a56620f 100644 --- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.cpp +++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.cpp @@ -28,7 +28,7 @@ namespace nnfusion { namespace onnx_import { - bool ONNXDataTypeToNNFusionElementType(const onnx::TensorProto_DataType onnx_dt, + bool ONNXDataTypeToNNFusionElementType(onnx::TensorProto_DataType onnx_dt, nnfusion::element::Type* nnfusion_et) { switch (onnx_dt) @@ -36,8 +36,10 @@ namespace nnfusion case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: *nnfusion_et = element::boolean; break; - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: + *nnfusion_et = element::f16; + break; + case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: *nnfusion_et = element::f32; break; case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: @@ -86,35 +88,38 @@ namespace nnfusion const Shape shape, const Tensor& tensor) { - switch (onnx_et) - { - case 
onnx::TensorProto_DataType::TensorProto_DataType_BOOL: - return make_constant_op(element::boolean, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: - return make_constant_op(element::f32, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: - return make_constant_op(element::f64, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_INT8: - return make_constant_op(element::i8, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_INT16: - return make_constant_op(element::i16, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_INT32: - return make_constant_op(element::i32, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_INT64: - return make_constant_op(element::i64, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: - return make_constant_op(element::u8, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: - return make_constant_op(element::u16, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: - return make_constant_op(element::u32, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: - return make_constant_op(element::u64, shape, tensor); - default: - NNFUSION_CHECK_FAIL() << "unsupported value info element type: " - << onnx::TensorProto_DataType_Name(onnx_et); - } + element::Type element_type = tensor.get_ng_type(); + return std::make_shared( + element_type, shape, tensor.buffer_get_data()); + // switch (onnx_et) + // { + // case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: + // return make_constant_op(element::boolean, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: + // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: + // return make_constant_op(element::f32, shape, tensor); + // case 
onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: + // return make_constant_op(element::f64, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_INT8: + // return make_constant_op(element::i8, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_INT16: + // return make_constant_op(element::i16, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_INT32: + // return make_constant_op(element::i32, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_INT64: + // return make_constant_op(element::i64, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: + // return make_constant_op(element::u8, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: + // return make_constant_op(element::u16, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: + // return make_constant_op(element::u32, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: + // return make_constant_op(element::u64, shape, tensor); + // default: + // NNFUSION_CHECK_FAIL() << "unsupported value info element type: " + // << onnx::TensorProto_DataType_Name(onnx_et); + // } } std::shared_ptr GetInputNode(const NodeMap& all_ng_nodes, @@ -280,6 +285,83 @@ namespace nnfusion name, std::vector(kernel_shape.size(), 1UL)); } + DataBuffer detail::buffer_get_data(const onnx::TensorProto& tensor) + { + size_t n_element = 1; + element::Type type; + bool status; + auto onnx_dt = static_cast(tensor.data_type()); + + status = ONNXDataTypeToNNFusionElementType(onnx_dt, &type); + + NNFUSION_CHECK(status) << "Unsupported ONNX data_type " << tensor.data_type() + << " is found"; + + DataBuffer buf(type); + + for (auto dim : tensor.dims()) + { + n_element *= dim; + } + buf.resize(n_element); + + if (tensor.has_raw_data()) + { + buf.load(tensor.raw_data().data(), n_element); + } + else + { +#define GET_VALUE(pb_type, 
mid_type) \ + do \ + { \ + const void* dat; \ + mid_type m; \ + NNFUSION_CHECK(n_element == tensor.pb_type##_data_size()) \ + << "Tensor shape is not the same with tensor data_size. (" << n_element \ + << " != " << tensor.pb_type##_data_size() << ")"; \ + for (size_t i = 0; i < n_element; ++i) \ + { \ + m = static_cast(tensor.pb_type##_data()[i]); \ + dat = reinterpret_cast(&m); \ + buf.setElement(i, dat); \ + } \ + } while (0) + + switch (onnx_dt) + { + case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: + GET_VALUE(int32, element::half); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: + GET_VALUE(float, float); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: + GET_VALUE(double, double); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_INT32: + GET_VALUE(int32, int32_t); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_INT64: + GET_VALUE(int64, int64_t); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: + GET_VALUE(uint64, uint64_t); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: + case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: + case onnx::TensorProto_DataType::TensorProto_DataType_INT16: + case onnx::TensorProto_DataType::TensorProto_DataType_INT8: + case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: + case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: + default: + NNFUSION_CHECK_FAIL() << "unsupported onnx element type: " + << onnx::TensorProto_DataType_Name(onnx_dt); + } +#undef GET_VALUE + } + return buf; + } + } // namespace onnx_import } // namespace frontend } // namespace nnfusion diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.hpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.hpp index 871ab4801..31bf27726 100644 --- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.hpp +++ 
b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.hpp @@ -28,6 +28,7 @@ #include #include "../onnx_base.hpp" +#include "ngraph/src/nnfusion/common/type/data_buffer.hpp" #include "nnfusion/common/common.hpp" namespace nnfusion @@ -51,6 +52,8 @@ namespace nnfusion return {it, it + (raw_data.size() / sizeof(T))}; } + DataBuffer buffer_get_data(const onnx::TensorProto& tensor); + template inline std::vector get_data(const onnx::TensorProto& tensor) { @@ -186,7 +189,7 @@ namespace nnfusion class Tensor; class Node; - bool ONNXDataTypeToNNFusionElementType(const onnx::TensorProto_DataType onnx_dt, + bool ONNXDataTypeToNNFusionElementType(onnx::TensorProto_DataType onnx_dt, nnfusion::element::Type* nnfusion_et); template From 18ab500f7be9333757128e57ae8e446e4809e8fc Mon Sep 17 00:00:00 2001 From: Niupple Date: Thu, 17 Dec 2020 11:22:50 +0800 Subject: [PATCH 17/32] fix onnx fp16 --- src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp | 4 ++-- .../src/nnfusion/frontend/onnx_import/op/constant.hpp | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp b/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp index 76d40424d..42c782ae4 100644 --- a/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp +++ b/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp @@ -25,6 +25,6 @@ namespace nnfusion public: bool run_on_graph(std::shared_ptr& graph) override; }; - } // namespace pass - } // namespace graph + } // namespace graph + } // namespace pass } // namespace nnfusion diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/op/constant.hpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/op/constant.hpp index f3dd0bfc9..4280f9f86 100644 --- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/op/constant.hpp +++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/op/constant.hpp @@ -65,8 +65,11 @@ namespace nnfusion Node node(node_proto); auto tensor = 
node.get_attribute_value("value"); - const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type()); - auto op = func_param(tensor.get_ng_type(), tensor); + // const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type()); + // auto op = func_param(tensor.get_ng_type(), tensor); + auto op = std::make_shared( + tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data() + ); op->set_name(node_proto.output(0)); auto gnode = m_graph->add_node_and_edge(op, graph::GNodeVector({})); From 9dd7529da14c999bc047305d87b8b9b42312eb87 Mon Sep 17 00:00:00 2001 From: Niupple Date: Tue, 29 Dec 2020 13:34:29 +0800 Subject: [PATCH 18/32] bert l1 runnable --- .../core/kernels/cuda_gpu/cuda_helper.cpp | 5 + .../kernels/cuda_gpu/kernels/batch_matmul.cpp | 18 +- .../core/kernels/cuda_gpu/kernels/dot.cpp | 190 +++++++++--------- .../core/kernels/cuda_gpu/kernels/reduce.hpp | 2 +- .../core/kernels/cuda_gpu/kernels/softmax.cpp | 7 +- .../engine/pass/graph/kernel_tuning.cpp | 6 + .../frontend/onnx_import/core/tensor.hpp | 2 +- .../frontend/onnx_import/op/constant.hpp | 3 +- .../frontend/onnx_import/ops_bridge.cpp | 1 + .../frontend/onnx_import/util/util.hpp | 2 +- 10 files changed, 133 insertions(+), 103 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp index db9856623..c48c0e41d 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp @@ -33,6 +33,11 @@ LanguageUnit_p cuda::get_math_kernel(const std::string& name, writer << ")\n"; writer << "{\n"; writer.indent++; + if (name == "convert" && data_types[num_inputs] == "half" && data_types[0] == "int64_t") + { + writer << "return (long long)" + math_kernel << ";\n"; + } + else { writer << "return " + math_kernel << ";\n"; } diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp index 173e95e93..c42d7780c 
100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp @@ -8,6 +8,7 @@ // [a] ./new_kernel_0.cpp // [b] ../../../ops/op_define/new_op_0.cpp +#include #include "../cuda_emitter.hpp" #include "../cuda_langunit.hpp" #include "nnfusion/core/operators/generic_op/generic_op.hpp" @@ -52,6 +53,15 @@ namespace nnfusion const nnfusion::Shape& input_shape_0 = m_context->inputs[0]->get_shape(); const nnfusion::Shape& input_shape_1 = m_context->inputs[1]->get_shape(); + element::Type dtype0 = m_context->inputs[0]->get_element_type(); + element::Type dtype1 = m_context->inputs[1]->get_element_type(); + element::Type dtype2 = m_context->outputs[0]->get_element_type(); + NNFUSION_CHECK(dtype0 == dtype1 && dtype1 == dtype2) + << "Unsupported element type combination of (" << dtype0.c_type_string() + << ", " << dtype1.c_type_string() << ") -> " << dtype2.c_type_string() + << "."; + element::Type& dtype = dtype0; + bool transA = generic_op->localOpConfig.getRoot()["adj_x"]["b"]; bool transB = generic_op->localOpConfig.getRoot()["adj_y"]["b"]; size_t A1 = 1LU; @@ -92,10 +102,11 @@ namespace nnfusion stride_b = A2 * A3, ldc = A4, stride_c = A2 * A4; } + std::string type = dtype.c_type_string(); float alpha = 1.0f, beta = 0.0f; auto code = nnfusion::op::create_code_from_template( R"( - static const float alpha = @alpha@F, beta = @beta@F; + static const @dtype@ alpha = @alpha@, beta = @beta@; // if (!@hCublas@) // CUBLAS_SAFE_CALL(@api_create@(&@hCublas@)); CUBLAS_SAFE_CALL(@api_exec@( @@ -106,7 +117,9 @@ namespace nnfusion { {"hCublas", "cublas_handle"}, {"api_create", "cublasCreate"}, - {"api_exec", "cublasSgemmStridedBatched"}, + {"api_exec", + dtype == element::f32 ? "cublasSgemmStridedBatched" + : "cublasHgemmStridedBatched"}, {"transA", transB ? "CUBLAS_OP_T" : "CUBLAS_OP_N"}, {"transB", transA ? 
"CUBLAS_OP_T" : "CUBLAS_OP_N"}, {"alpha", alpha}, @@ -121,6 +134,7 @@ namespace nnfusion {"stride_b", stride_b}, {"stride_c", stride_c}, {"batch", A1}, + {"dtype", type}, }); LanguageUnit_p _lu(new LanguageUnit(get_function_name())); diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index acd1939b6..24178ecb2 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -86,7 +86,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() // matrix * vector else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1)) { - lu << "const float alpha = 1.0;\n const float beta = 0;\n"; + lu << "const float alpha = 1.0;\n const float beta = 0.;\n"; lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, "; if (trans_A) lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", "; @@ -107,7 +107,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() int n = trans_A ? arg0_shape[1] : arg0_shape[0]; int k = trans_A ? arg0_shape[0] : arg0_shape[1]; - lu << "const float alpha = 1.0;\nconst float beta = 0;\n"; + lu << "const float alpha = 1.0;\nconst float beta = 0.;\n"; lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle," << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") @@ -186,7 +186,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() } } - lu << "const float alpha = 1.0;\nconst float beta = 0;\n"; + lu << "const float alpha = 1.0;\nconst float beta = 0.;\n"; lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle," << " CUBLAS_OP_N," @@ -261,111 +261,113 @@ LanguageUnit_p cuda::Dot::emit_function_body() // << " static_cast(output0)," // << " 1));\n"; // } - // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) && - // (trans_A || trans_B)) - // { - // int m = trans_B ? arg1_shape[0] : arg1_shape[1]; - // int n = trans_A ? arg0_shape[1] : arg0_shape[0]; - // int k = trans_A ? 
arg0_shape[0] : arg0_shape[1]; - - // lu << "const half alpha = 1.0;\nconst half beta = 0;\n"; - - // lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - // << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") - // << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << "," - // << " " << n << "," - // << " " << k << "," - // << " &alpha," - // << " static_cast(input1)," - // << " " << arg1_shape[1] << "," - // << " static_cast(input0)," - // << " " << arg0_shape[1] << "," - // << " &beta," - // << " static_cast(output0)," - // << " " << m << "));\n"; - // } else { - size_t axes_for_m_count = arg0_shape.size() - reduction_axes; - size_t axes_for_n_count = arg1_shape.size() - reduction_axes; - size_t axes_for_k_count = reduction_axes; - size_t m = 1; - size_t n = 1; - size_t k = 1; - - // check if input and output size correct - // check and calculate k for arg0 and arg1 - size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k - size_t arg1_k_idx = 0; // first axe in arg1 for k - - for (size_t i = 0; i < axes_for_k_count; i++) + if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) && + (trans_A || trans_B)) { - k *= arg0_shape[arg0_k_idx]; - if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) - { - std::vector arg_vec{"arg0", "arg1"}; - std::vector shape_vec{arg0_shape, arg1_shape}; + int m = trans_B ? arg1_shape[0] : arg1_shape[1]; + int n = trans_A ? arg0_shape[1] : arg0_shape[0]; + int k = trans_A ? arg0_shape[0] : arg0_shape[1]; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } + lu << "const half alpha = 1.0;\nconst half beta = 0.;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") + << (trans_A ? 
" CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << "," + << " " << n << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << arg1_shape[1] << "," + << " static_cast(input0)," + << " " << arg0_shape[1] << "," + << " &beta," + << " static_cast(output0)," + << " " << m << "));\n"; } - // check and calculate m for arg0 and out - size_t arg0_m_idx = 0; // first axe in arg0 for m - size_t out_m_idx = 0; // first axe in out for m - for (size_t i = 0; i < axes_for_m_count; i++) + else { - m *= arg0_shape[arg0_m_idx]; - if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + size_t axes_for_m_count = arg0_shape.size() - reduction_axes; + size_t axes_for_n_count = arg1_shape.size() - reduction_axes; + size_t axes_for_k_count = reduction_axes; + size_t m = 1; + size_t n = 1; + size_t k = 1; + + // check if input and output size correct + // check and calculate k for arg0 and arg1 + size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k + size_t arg1_k_idx = 0; // first axe in arg1 for k + + for (size_t i = 0; i < axes_for_k_count; i++) { - std::vector arg_vec{"arg0", "output"}; - std::vector shape_vec{arg0_shape, out_shape}; + k *= arg0_shape[arg0_k_idx]; + if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + { + std::vector arg_vec{"arg0", "arg1"}; + std::vector shape_vec{arg0_shape, arg1_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - // check and calculate n for arg1 and out - size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n - size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n - for (size_t i = 0; i < axes_for_n_count; i++) - { - n *= 
arg1_shape[arg1_n_idx]; - if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + // check and calculate m for arg0 and out + size_t arg0_m_idx = 0; // first axe in arg0 for m + size_t out_m_idx = 0; // first axe in out for m + for (size_t i = 0; i < axes_for_m_count; i++) { - std::vector arg_vec{"arg1", "output"}; - std::vector shape_vec{arg1_shape, out_shape}; + m *= arg0_shape[arg0_m_idx]; + if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + { + std::vector arg_vec{"arg0", "output"}; + std::vector shape_vec{arg0_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } + // check and calculate n for arg1 and out + size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n + size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n + for (size_t i = 0; i < axes_for_n_count; i++) + { + n *= arg1_shape[arg1_n_idx]; + if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + { + std::vector arg_vec{"arg1", "output"}; + std::vector shape_vec{arg1_shape, out_shape}; - lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << " CUBLAS_OP_N," - << " CUBLAS_OP_N," - << " " << n << "," - << " " << m << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << n << "," - << " static_cast(input0)," - << " " << k << "," - << " &beta," - << " static_cast(output0)," - << " " << n << "));\n"; - // } + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } + } + + lu << "const half alpha = 
1.0f;\nconst half beta = 0.f;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << "));\n"; + } } else { - NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."; + NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for kernel dot."; } //lu.block_end(); return _lu; diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp index 97353e5e8..c9bfb3c26 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp @@ -189,7 +189,7 @@ int data_idx_offset = block_idx * width; float val = 0.0; for (int tidx = thread_idx; tidx < width; tidx += block_size) { int data_idx = tidx + data_idx_offset; - val += input0[data_idx]; + val += static_cast(input0[data_idx]); } val = reduceSum(val, thread_idx, block_size, shm); if (thread_idx == 0) output0[block_idx] = val; diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp index c653abedc..78e7af632 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
#include "softmax.hpp" +#include "../cuda_cudnn.hpp" #include "nnfusion/core/operators/generic_op/generic_op.hpp" using namespace nnfusion; @@ -22,7 +23,8 @@ LanguageUnit_p { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; - string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type); + element::Type type = m_context->inputs[0]->get_element_type(); + string data_type = cuda::get_cudnn_datatype(type); string tensor_format = "CUDNN_TENSOR_NCHW"; lu << "cudnnTensorDescriptor_t " << desc << ";\n"; lu << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << desc << "));\n"; @@ -154,7 +156,8 @@ LanguageUnit_p { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; - string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type); + element::Type type = m_context->inputs[0]->get_element_type(); + string data_type = cuda::get_cudnn_datatype(type); string tensor_format = "CUDNN_TENSOR_NCHW"; lu << "cudnnTensorDescriptor_t " << desc << ";\n"; lu << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << desc << "));\n"; diff --git a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp index 9fa4ae1bc..5132845c3 100644 --- a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp @@ -91,6 +91,12 @@ void print_tuning_results(std::vector> tuned_kerne << std::setw(10) << s->status << " | " << std::setw(6) << s->progress_step << "/" << FLAGS_fkernel_tuning_steps << " " << " | " << std::setw(12) << s->best_perf << " ms |\n"; + + if (fabs(s->best_perf + 1.0) < 1e-5) + { + NNFUSION_LOG(INFO) << "Kernel named \"" << s->op_name << "\" has not yet been tuned.\n" + << s->ir; + } } NNFUSION_LOG(INFO) << ss.str(); } diff --git a/src/nnfusion/frontend/onnx_import/core/tensor.hpp b/src/nnfusion/frontend/onnx_import/core/tensor.hpp index 85a624b14..201aa580d 100644 --- a/src/nnfusion/frontend/onnx_import/core/tensor.hpp +++ b/src/nnfusion/frontend/onnx_import/core/tensor.hpp @@ 
-22,7 +22,7 @@ #pragma once #include "../util/util.hpp" -#include "ngraph/src/nnfusion/common/type/data_buffer.hpp" +#include "nnfusion/common/type/data_buffer.hpp" namespace nnfusion { diff --git a/src/nnfusion/frontend/onnx_import/op/constant.hpp b/src/nnfusion/frontend/onnx_import/op/constant.hpp index 4280f9f86..87163617d 100644 --- a/src/nnfusion/frontend/onnx_import/op/constant.hpp +++ b/src/nnfusion/frontend/onnx_import/op/constant.hpp @@ -68,8 +68,7 @@ namespace nnfusion // const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type()); // auto op = func_param(tensor.get_ng_type(), tensor); auto op = std::make_shared( - tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data() - ); + tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data()); op->set_name(node_proto.output(0)); auto gnode = m_graph->add_node_and_edge(op, graph::GNodeVector({})); diff --git a/src/nnfusion/frontend/onnx_import/ops_bridge.cpp b/src/nnfusion/frontend/onnx_import/ops_bridge.cpp index d63fbf34a..9724b435a 100644 --- a/src/nnfusion/frontend/onnx_import/ops_bridge.cpp +++ b/src/nnfusion/frontend/onnx_import/ops_bridge.cpp @@ -134,6 +134,7 @@ namespace nnfusion REGISTER_EMPTY_DOMAIN("ai.onnx.ml"); REGISTER_EMPTY_DOMAIN("com.microsoft"); REGISTER_EMPTY_DOMAIN("com.microsoft.mlfeaturizers"); + REGISTER_EMPTY_DOMAIN("ai.onnx.preview.training"); REGISTER_OPERATOR("Abs", 1, TranslateUnaryOp); REGISTER_OPERATOR("Acos", 1, TranslateUnaryOp); REGISTER_OPERATOR("AdamOptimizer", 1, TranslateAdamOptimizerOp); diff --git a/src/nnfusion/frontend/onnx_import/util/util.hpp b/src/nnfusion/frontend/onnx_import/util/util.hpp index 31bf27726..2f7262842 100644 --- a/src/nnfusion/frontend/onnx_import/util/util.hpp +++ b/src/nnfusion/frontend/onnx_import/util/util.hpp @@ -28,8 +28,8 @@ #include #include "../onnx_base.hpp" -#include "ngraph/src/nnfusion/common/type/data_buffer.hpp" #include "nnfusion/common/common.hpp" +#include "nnfusion/common/type/data_buffer.hpp" namespace 
nnfusion { From f015bc62893d398ba52e398ddc6a25e436b3426f Mon Sep 17 00:00:00 2001 From: Niupple Date: Tue, 5 Jan 2021 17:07:37 +0800 Subject: [PATCH 19/32] change priority of evaluator runtime --- src/nnfusion/frontend/util/evaluator.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/nnfusion/frontend/util/evaluator.hpp b/src/nnfusion/frontend/util/evaluator.hpp index 1e5a56e36..9edbc43c0 100644 --- a/src/nnfusion/frontend/util/evaluator.hpp +++ b/src/nnfusion/frontend/util/evaluator.hpp @@ -105,21 +105,21 @@ namespace nnfusion nnfusion::profiler::IProfilingRuntime::Pointer runtime = nullptr; std::vector> kernel_regs; - runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime(); + runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime(); if (runtime->check_env()) { kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), ROCM_GPU, element::f32); - if (kernel_regs.size() == 0) - kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), CUDA_GPU, element::f32); + gnode->get_op_type(), CUDA_GPU, element::f32); } else { - runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime(); + runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime(); NNFUSION_CHECK(runtime->check_env()); kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), CUDA_GPU, element::f32); + gnode->get_op_type(), ROCM_GPU, element::f32); + if (kernel_regs.size() == 0) + kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( + gnode->get_op_type(), CUDA_GPU, element::f32); } bool const_infer_success = false; From 86ccb2d1ee99723a96f641d1a1e9eed74fbff2d8 Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 25 Nov 2020 12:42:56 +0000 Subject: [PATCH 20/32] changes for fp16 --- .../core/kernels/cpu/reference/constant.cpp | 2 +- .../core/kernels/cpu/reference/variable.cpp | 2 +- .../kernels/cuda_gpu/kernels/apply_adam.cpp | 2 +- 
.../kernels/cuda_gpu/kernels/constant.cpp | 2 +- .../kernels/cuda_gpu/kernels/convolution.cpp | 1 + .../core/kernels/cuda_gpu/kernels/dot.cpp | 80 +++++++++++++++++++ .../cuda_gpu/kernels/dynamic_stitch.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/pad.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/range.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/result.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/reverse.cpp | 2 +- .../cuda_gpu/kernels/reverse_sequence.cpp | 2 +- .../cuda_gpu/kernels/strided_slice_grad.cpp | 2 +- .../core/kernels/cuda_gpu/kernels/tile.cpp | 2 +- .../kernels/cuda_gpu/kernels/variable.cpp | 2 +- .../core/kernels/kernel_registration.cpp | 1 + 16 files changed, 95 insertions(+), 13 deletions(-) diff --git a/src/nnfusion/core/kernels/cpu/reference/constant.cpp b/src/nnfusion/core/kernels/cpu/reference/constant.cpp index 7917d10ad..b922d2cee 100644 --- a/src/nnfusion/core/kernels/cpu/reference/constant.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/constant.cpp @@ -71,4 +71,4 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Constant", //op_name Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs - cpu::Constant) // constructor \ No newline at end of file + cpu::Constant) // constructor diff --git a/src/nnfusion/core/kernels/cpu/reference/variable.cpp b/src/nnfusion/core/kernels/cpu/reference/variable.cpp index 5e16388f6..a4eeeea2b 100644 --- a/src/nnfusion/core/kernels/cpu/reference/variable.cpp +++ b/src/nnfusion/core/kernels/cpu/reference/variable.cpp @@ -69,4 +69,4 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Variable", //op_name Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs - cpu::Variable) // constructor \ No newline at end of file + cpu::Variable) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp index e42e0eda9..44308b801 100644 --- 
a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp @@ -108,4 +108,4 @@ using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER( "ApplyAdam", Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), - cuda::ApplyAdam) \ No newline at end of file + cuda::ApplyAdam) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp index 73f04f5fd..d43bcca07 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp @@ -121,4 +121,4 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Constant", //op_name Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::Constant) // constructor \ No newline at end of file + cuda::Constant) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp index 6b91e3956..549d428c6 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp @@ -87,6 +87,7 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_body() padding_below[i] = static_cast(padding_below_diff[i]); } + { // lu << "cudnnDataType_t data_type = " << get_cudnn_datatype(dtype) << ";\n"; lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index acd1939b6..23e7761cf 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -203,6 +203,86 @@ LanguageUnit_p cuda::Dot::emit_function_body() << " static_cast(output0)," << " " << n << "));\n"; } + } else if (dtype == element::f16) { + size_t axes_for_m_count = 
arg0_shape.size() - reduction_axes; + size_t axes_for_n_count = arg1_shape.size() - reduction_axes; + size_t axes_for_k_count = reduction_axes; + size_t m = 1; + size_t n = 1; + size_t k = 1; + + // check if input and output size correct + // check and calculate k for arg0 and arg1 + size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k + size_t arg1_k_idx = 0; // first axe in arg1 for k + + for (size_t i = 0; i < axes_for_k_count; i++) + { + k *= arg0_shape[arg0_k_idx]; + if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + { + std::vector arg_vec{"arg0", "arg1"}; + std::vector shape_vec{arg0_shape, arg1_shape}; + + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } + } + // check and calculate m for arg0 and out + size_t arg0_m_idx = 0; // first axe in arg0 for m + size_t out_m_idx = 0; // first axe in out for m + for (size_t i = 0; i < axes_for_m_count; i++) + { + m *= arg0_shape[arg0_m_idx]; + if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + { + std::vector arg_vec{"arg0", "output"}; + std::vector shape_vec{arg0_shape, out_shape}; + + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } + } + // check and calculate n for arg1 and out + size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n + size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n + for (size_t i = 0; i < axes_for_n_count; i++) + { + n *= arg1_shape[arg1_n_idx]; + if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + { + std::vector arg_vec{"arg1", "output"}; + std::vector shape_vec{arg1_shape, out_shape}; + + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", 
do not match for dot op"; + } + } + + lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << "));\n"; + } else { + NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot." } else if (dtype == element::f16) { diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp index 4bd847949..00cd81136 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp @@ -124,4 +124,4 @@ LanguageUnit_p cuda::DynamicStitch::emit_dependency() REGISTER_KERNEL_EMITTER( "DynamicStitch", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::DynamicStitch) // constructor \ No newline at end of file + cuda::DynamicStitch) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp index faab94fe9..733c037e4 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp @@ -149,4 +149,4 @@ KernelRegistrar kernel_registrar0( REGISTER_KERNEL_EMITTER( "Pad", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Pad) // constructor \ No newline at end of file + cuda::Pad) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp index 1c5a30279..f7a06a159 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp +++ 
b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp @@ -65,4 +65,4 @@ LanguageUnit_p cuda::Range::emit_dependency() REGISTER_KERNEL_EMITTER( "Range", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Range) // constructor \ No newline at end of file + cuda::Range) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp index 229580e6a..73b819fd7 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp @@ -90,4 +90,4 @@ LanguageUnit_p cuda::Result::emit_dependency() REGISTER_KERNEL_EMITTER( "Result", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs - cuda::Result) // constructor \ No newline at end of file + cuda::Result) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp index 6d5fc374d..e3be51ffc 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp @@ -103,4 +103,4 @@ LanguageUnit_p cuda::Reverse::emit_dependency() REGISTER_KERNEL_EMITTER( "Reverse", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::Reverse) // constructor \ No newline at end of file + cuda::Reverse) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp index 487951930..612c51730 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp @@ -130,4 +130,4 @@ REGISTER_KERNEL_EMITTER( REGISTER_KERNEL_EMITTER("ReverseSequence", // op_name Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), // attrs - cuda::RocmReverseSequence) // 
constructor \ No newline at end of file + cuda::RocmReverseSequence) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp index 342edf949..ab27b0ec7 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp @@ -115,4 +115,4 @@ LanguageUnit_p cuda::StridedSliceGrad::emit_dependency() REGISTER_KERNEL_EMITTER( "StridedSliceGrad", // op_name Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs - cuda::StridedSliceGrad) // constructor \ No newline at end of file + cuda::StridedSliceGrad) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp index 33a869e71..6dc5220d9 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp @@ -153,4 +153,4 @@ REGISTER_KERNEL_EMITTER( REGISTER_KERNEL_EMITTER("Tile", //op_name Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::RocmTile) // constructor \ No newline at end of file + cuda::RocmTile) // constructor diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp index 419124649..80c5cc707 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp @@ -79,4 +79,4 @@ using namespace nnfusion; using namespace nnfusion::kernels; REGISTER_KERNEL_EMITTER("Variable", //op_name Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs - cuda::Variable) // constructor \ No newline at end of file + cuda::Variable) // constructor diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp index 67ded5b8f..9aba33df2 100644 --- 
a/src/nnfusion/core/kernels/kernel_registration.cpp +++ b/src/nnfusion/core/kernels/kernel_registration.cpp @@ -4,6 +4,7 @@ #include "kernel_registration.hpp" #include "nnfusion/common/type/element_type.hpp" #include "nnfusion/util/util.hpp" +#include "ngraph/src/nnfusion/common/type/element_type.hpp" using namespace nnfusion; using namespace nnfusion::kernels; From e9fa2861d15546c376c9d3a0c61af81849d4f448 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 27 Nov 2020 09:00:51 +0000 Subject: [PATCH 21/32] vgg11 runnable --- .../core/kernels/cuda_gpu/kernels/dot.cpp | 210 ++++--- .../pass/graph/codegen_dxcompute_pass.hpp | 533 ++++++++++++++++++ 2 files changed, 678 insertions(+), 65 deletions(-) create mode 100644 src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 23e7761cf..7838e4300 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -204,83 +204,163 @@ LanguageUnit_p cuda::Dot::emit_function_body() << " " << n << "));\n"; } } else if (dtype == element::f16) { - size_t axes_for_m_count = arg0_shape.size() - reduction_axes; - size_t axes_for_n_count = arg1_shape.size() - reduction_axes; - size_t axes_for_k_count = reduction_axes; - size_t m = 1; - size_t n = 1; - size_t k = 1; + // case 1: Scalar * Tensor + // if (arg0_shape.empty() || arg1_shape.empty()) + // { + // auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape); + // size_t count = nnfusion::shape_size(second); - // check if input and output size correct - // check and calculate k for arg0 and arg1 - size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k - size_t arg1_k_idx = 0; // first axe in arg1 for k + // string firstarg = (arg0_shape.empty() ? "input1" : "input0"); + // string secondarg = (arg0_shape.empty() ? 
"input0" : "input1"); - for (size_t i = 0; i < axes_for_k_count; i++) - { - k *= arg0_shape[arg0_k_idx]; - if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + // lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; + + // lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n"; // copy `firstarg` to `output0` + // lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n"; + // } + // // case 2: 1d Dot + // else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes)) + // { + // for (int i = 0; i < arg0_shape.size(); i++) + // { + // if (arg0_shape[i] != arg1_shape[i]) + // { + // std::vector arg_vec{"arg0", "arg1"}; + // std::vector shape_vec{arg0_shape, arg1_shape}; + + // NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + // << nnfusion::join(shape_vec) << " respectively, at Node " + // << m_context->gnode->get_name() + // << ", do not match for dot op"; + // } + // } + + // size_t count = nnfusion::shape_size(arg0_shape); + // lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; + + // lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count + // << ", static_cast(input0), 1, static_cast(input1), 1, " + // "static_cast(output0)));\n"; + // } + // // matrix * vector + // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1)) + // { + // lu << "const float alpha = 1.0;\n const float beta = 0;\n"; + // lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, "; + // if (trans_A) + // lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", "; + // else + // lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", "; + // lu << " &alpha," + // << " static_cast(input0)," << arg0_shape[1] << ", " + // << " static_cast(input1)," + // << " 1," + // << " &beta," + // << " static_cast(output0)," + // << " 1));\n"; + // } + // else 
if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) && + // (trans_A || trans_B)) + // { + // int m = trans_B ? arg1_shape[0] : arg1_shape[1]; + // int n = trans_A ? arg0_shape[1] : arg0_shape[0]; + // int k = trans_A ? arg0_shape[0] : arg0_shape[1]; + + // lu << "const half alpha = 1.0;\nconst half beta = 0;\n"; + + // lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + // << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") + // << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << "," + // << " " << n << "," + // << " " << k << "," + // << " &alpha," + // << " static_cast(input1)," + // << " " << arg1_shape[1] << "," + // << " static_cast(input0)," + // << " " << arg0_shape[1] << "," + // << " &beta," + // << " static_cast(output0)," + // << " " << m << "));\n"; + // } else { + size_t axes_for_m_count = arg0_shape.size() - reduction_axes; + size_t axes_for_n_count = arg1_shape.size() - reduction_axes; + size_t axes_for_k_count = reduction_axes; + size_t m = 1; + size_t n = 1; + size_t k = 1; + + // check if input and output size correct + // check and calculate k for arg0 and arg1 + size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k + size_t arg1_k_idx = 0; // first axe in arg1 for k + + for (size_t i = 0; i < axes_for_k_count; i++) { - std::vector arg_vec{"arg0", "arg1"}; - std::vector shape_vec{arg0_shape, arg1_shape}; + k *= arg0_shape[arg0_k_idx]; + if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + { + std::vector arg_vec{"arg0", "arg1"}; + std::vector shape_vec{arg0_shape, arg1_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - // check and calculate m 
for arg0 and out - size_t arg0_m_idx = 0; // first axe in arg0 for m - size_t out_m_idx = 0; // first axe in out for m - for (size_t i = 0; i < axes_for_m_count; i++) - { - m *= arg0_shape[arg0_m_idx]; - if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + // check and calculate m for arg0 and out + size_t arg0_m_idx = 0; // first axe in arg0 for m + size_t out_m_idx = 0; // first axe in out for m + for (size_t i = 0; i < axes_for_m_count; i++) { - std::vector arg_vec{"arg0", "output"}; - std::vector shape_vec{arg0_shape, out_shape}; + m *= arg0_shape[arg0_m_idx]; + if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + { + std::vector arg_vec{"arg0", "output"}; + std::vector shape_vec{arg0_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - // check and calculate n for arg1 and out - size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n - size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n - for (size_t i = 0; i < axes_for_n_count; i++) - { - n *= arg1_shape[arg1_n_idx]; - if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + // check and calculate n for arg1 and out + size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n + size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n + for (size_t i = 0; i < axes_for_n_count; i++) { - std::vector arg_vec{"arg1", "output"}; - std::vector shape_vec{arg1_shape, out_shape}; + n *= arg1_shape[arg1_n_idx]; + if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + { + std::vector arg_vec{"arg1", "output"}; + std::vector shape_vec{arg1_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - 
<< nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - - lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << " CUBLAS_OP_N," - << " CUBLAS_OP_N," - << " " << n << "," - << " " << m << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << n << "," - << " static_cast(input0)," - << " " << k << "," - << " &beta," - << " static_cast(output0)," - << " " << n << "));\n"; + lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << "));\n"; + // } + } else { NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot." } diff --git a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp new file mode 100644 index 000000000..1779ad827 --- /dev/null +++ b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp @@ -0,0 +1,533 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#pragma once + +#include "graph_pass_base.hpp" +#include "nnfusion/core/operators/generic_op/generic_op.hpp" +#include "nnfusion/core/operators/op_define/constant.hpp" +#include "nnfusion/engine/profiler/profiler.hpp" +#include "nnfusion/util/curl_request.hpp" + +using namespace nnfusion::graph; + +DECLARE_string(fdefault_device); +DECLARE_string(fantares_codegen_server); + +namespace nnfusion +{ + namespace pass + { + namespace graph + { + class DirectComputeCodegenPass : public GraphPassBase + { + std::string currentBackend; + std::string autogen(const std::string& expr) + { + if (FLAGS_fantares_codegen_server == "") + FLAGS_fantares_codegen_server = "10.150.145.98:8884"; + static std::unordered_map code_cache; + std::string response; + auto it = code_cache.find(expr); + if (it == code_cache.end()) + { + CurlRequest req(FLAGS_fantares_codegen_server); + req.add_custom_header(("COMPUTE_V1: " + expr).c_str()); + req.add_custom_header("ARGS: "); + + printf("[Autogen] %s\n", expr.c_str()); + NNFUSION_CHECK(true == req.send_request(response)); + NNFUSION_CHECK(strncmp(response.c_str(), "[ERROR]", 7) != 0) << expr << "\n" + << response; + code_cache[expr] = response; + return std::move(response); + } + else + return it->second; + } + + template + inline std::string + join_collections(const T1& vect, T2 func, bool skip_empty = false) + { + std::stringstream result; + int idx = 0; + for (auto& it : vect) + { + auto str = func(idx, it); + if (!str.size() && skip_empty) + continue; + if (idx > 0) + result << ", "; + result << str; + ++idx; + } + return result.str(); + } + + // inline int get_type_id(nnfusion::element::Type type) + // { + // // TODO: fill more type cases + // if (type == nnfusion::element::f32) + // return DT_FLOAT; + // throw std::runtime_error("Not supported element type."); + // } + + template + inline std::shared_ptr get_op_object(std::shared_ptr& curr) + { + auto _op = static_pointer_cast(curr->get_op_ptr()); + NNFUSION_CHECK_NOT_NULLPTR(_op) << 
"Node type is not " + << curr->get_op_ptr()->get_op_type(); + return _op; + } + + inline void UNHANDLED_CASE(std::shared_ptr& curr) + { + printf("## Unhandled case for %s:\n", + curr->get_op_ptr()->get_op_type().c_str()); + for (int i = 0; i < curr->get_input_size(); ++i) + printf(">> in-%d : %s\n", + i, + vector_to_string(curr->get_input_shape(i)).c_str()); + for (int i = 0; i < curr->get_output_size(); ++i) + printf(">> out-%d: %s\n", + i, + vector_to_string(curr->get_output_shape(i)).c_str()); + exit(1); + }; + + public: + bool run_on_graph(std::shared_ptr& graph) override + { + currentBackend = "dxcompute"; + + NNFUSION_LOG(INFO) << "Codegen for " << currentBackend << " starts up."; + + auto nodes = graph->get_nodes(); + std::unordered_map, int> din, dout; + + // Count degrees + for (auto& it : nodes) + { + for (auto& in_edge : it->get_in_edges()) + { + if (in_edge->is_control_edge()) + continue; + NNFUSION_CHECK(in_edge->get_dst() == it); + din[it]++; + dout[in_edge->get_src()]++; + } + } + + // Name nodes, legality checks + std::unordered_set> visited, vis_pend, blacklist; + std::unordered_set name_used; + std::unordered_map, std::string> arg_names; + for (auto& it : nodes) + { + NNFUSION_CHECK(it.get() != nullptr); + + auto arg_name = "Z0_" + it->get_op_ptr()->get_op_type() + "_" + + it->get_op_ptr()->get_name(); + for (auto& c : arg_name) + if (!isalpha(c) && !isdigit(c)) + c = '_'; + if (name_used.count(arg_name)) + { + for (int i = 1;; ++i) + { + auto alter = arg_name + "_" + std::to_string(i); + if (!name_used.count(alter)) + { + arg_name = alter; + break; + } + } + } + name_used.insert(arg_name); + arg_names[it] = arg_name; + + if (din[it] == 0 && dout[it] == 0) + visited.insert(it), blacklist.insert(it); + NNFUSION_CHECK(it->get_output_size() == 1); + } + NNFUSION_LOG(INFO) << "There are " << blacklist.size() + << " standalone GNode(s) found."; + name_used.clear(); + + // Fill offsetup nodes + std::deque> gen_q, pend_q; + for (auto& it : nodes) + { + 
if (visited.count(it)) + continue; + if (din[it] == 0) + { + gen_q.push_back(it); + } + } + + NNFUSION_CHECK( + 0 == + system(("mkdir -p nnfusion_rt/" + currentBackend + "_codegen").c_str())); + + std::ofstream fout("nnfusion_rt/" + currentBackend + "_codegen/nnfusion_rt.h"); + + fout << "#if 1\n\n"; + // Perform blockfusion + int offset = 0, step = 0; + auto new_super_step = [&]() { + while (pend_q.size()) + { + gen_q.push_back(pend_q.front()); + pend_q.pop_front(); + } + if (offset > 0) + ++step, offset = 0; + }; + + auto print_standard_codegen = [&](std::shared_ptr& curr, + std::ofstream& fout, + std::string ir, + std::string options) { + std::string code = autogen(ir); + + if (options.size() > 0) + { + if (options[0] != '|') + options = "|" + options; + if (options.back() != '|') + options += "|"; + } + + if (int(options.find("|memcpy|")) >= 0) + { + NNFUSION_CHECK(curr->get_input_size() == 1); + fout << "NNfusionTensor &" << arg_names[curr] << " = " + << arg_names[curr->get_in_edge(0)->get_src()] << ";\n"; + return; + } + + static std::unordered_map dedupe_kernels; + auto kernel = dedupe_kernels.find(code); + if (kernel == dedupe_kernels.end()) + { + NNFUSION_CHECK(0 == system(("mkdir -p nnfusion_rt/" + currentBackend + + "_codegen/HLSL") + .c_str())); + FILE* fp = fopen(("nnfusion_rt/" + currentBackend + "_codegen/HLSL/" + + arg_names[curr] + ".hlsl") + .c_str(), + "wb"); + NNFUSION_CHECK(fp != nullptr); + NNFUSION_CHECK(code.size() == fwrite(code.c_str(), 1, code.size(), fp)); + fclose(fp); + dedupe_kernels[code] = arg_names[curr]; + kernel = dedupe_kernels.find(code); + } + + fout << "// " << ir << "\n"; + if (int(options.find("|inplace_wg|")) < 0) + { + fout << "NNfusionTensor " << arg_names[curr] << "(device, {" + << join_collections( + curr->get_output_shape(0), + [](int idx, ssize_t it) { return std::to_string(it); }) + << "}, sizeof(" << curr->get_output_element_type(0).c_type_string() + << "));\n"; + + fout << " NNfusionOperator op_" << 
arg_names[curr] << "(device, {"; + for (int i = 0; i < curr->get_input_size(); ++i) + { + if (i) + fout << ", "; + fout << arg_names[curr->get_in_edge(i)->get_src()]; + } + fout << "}, { " << arg_names[curr] << " }, L\"" << kernel->second + << ".hlsl\");"; + } + else + { + fout << " NNfusionOperator op_" << arg_names[curr] << "(device, {"; + for (int i = 0; i < curr->get_input_size(); ++i) + { + if (i) + fout << ", "; + fout << arg_names[curr->get_in_edge(i)->get_src()]; + } + fout << "}, { " << arg_names[curr->get_in_edge(0)->get_src()] + << " }, L\"" << kernel->second << ".hlsl\");\n"; + fout << "auto& " << arg_names[curr] << " = " + << arg_names[curr->get_in_edge(0)->get_src()] << ";"; + } + }; + + auto codegen_for_elementwise = [&](std::shared_ptr& curr, + std::ofstream& fout, + const std::string& topi, + const std::string& options = "") { + std::string expr = " -"; + for (int i = 0; i < curr->get_input_size(); ++i) + expr += " input(\"input" + std::to_string(i) + "\", @common_shape@);"; + expr += " output(@common_shape@, " + topi + ");"; + + int num_elements = 1, y; + for (auto& it : curr->get_input_shape(0)) + num_elements *= it; + + print_standard_codegen( + curr, + fout, + op::create_code_from_template( + expr, + {{"common_shape", "[ " + std::to_string(num_elements) + " ]"}}), + options); + }; + + std::unordered_map&, std::ofstream&)>> + kernel_dict; + + // Elementwise Ops + kernel_dict["Subtract"] = [&](std::shared_ptr& curr, + std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.subtract(args(\"input0\"), args(\"input1\"))"); + }; + kernel_dict["Multiply"] = [&](std::shared_ptr& curr, + std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.multiply(args(\"input0\"), args(\"input1\"))"); + }; + kernel_dict["Divide"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.divide(args(\"input0\"), args(\"input1\"))"); + }; + kernel_dict["DivNoNan"] = 
[&](std::shared_ptr& curr, + std::ofstream& fout) { + codegen_for_elementwise( + curr, + fout, + "lambda x: tvm.te.if_then_else(args(\"input1\")[x] != " + "0, args(\"input0\")[x] / args(\"input1\")[x], 0)"); + }; + kernel_dict["Power"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.power(args(\"input0\"), args(\"input1\"))"); + }; + kernel_dict["LessEq"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.less_equal(args(\"input0\"), args(\"input1\"))"); + }; + kernel_dict["Equal"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.equal(args(\"input0\"), args(\"input1\"))"); + }; + kernel_dict["Maximum"] = [&](std::shared_ptr& curr, + std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.maximum(args(\"input0\"), args(\"input1\"))"); + }; + kernel_dict["Exp"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise(curr, fout, "topi=topi.exp(args(\"input0\"))"); + }; + kernel_dict["Negative"] = [&](std::shared_ptr& curr, + std::ofstream& fout) { + codegen_for_elementwise(curr, fout, "topi=topi.negative(args(\"input0\"))"); + }; + kernel_dict["Tanh"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise(curr, fout, "topi=topi.tanh(args(\"input0\"))"); + }; + kernel_dict["Relu6"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.clip(args(\"input0\"), 0, 6)"); + }; + kernel_dict["Sigmoid"] = [&](std::shared_ptr& curr, + std::ofstream& fout) { + codegen_for_elementwise(curr, fout, "topi=topi.sigmoid(args(\"input0\"))"); + }; + kernel_dict["Square"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise( + curr, fout, "topi=topi.multiply(args(\"input0\"), args(\"input0\"))"); + }; + kernel_dict["Rsqrt"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + 
codegen_for_elementwise(curr, fout, "topi=topi.rsqrt(args(\"input0\"))"); + }; + kernel_dict["Log"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise(curr, fout, "topi=topi.log(args(\"input0\"))"); + }; + kernel_dict["ReluBackprop"] = [&](std::shared_ptr& curr, + std::ofstream& fout) { + codegen_for_elementwise( + curr, + fout, + "lambda x: tvm.te.if_then_else(args(\"input0\")[x] > " + "0, args(\"input1\")[x], 0)"); + }; + kernel_dict["Select"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + codegen_for_elementwise( + curr, + fout, + "lambda x: tvm.te.if_then_else(args(\"input0\")[x] == " + "0, args(\"input2\")[x], args(\"input1\")[x])"); + }; + + // Non-standard Ops + kernel_dict["Constant"] = [&](std::shared_ptr& curr, + std::ofstream& fout) { + auto p_const = std::dynamic_pointer_cast(curr->get_op_ptr()); + NNFUSION_CHECK(p_const != nullptr); + const void* dptr = p_const->get_data_ptr(); + size_t size = p_const->get_data_size(); + + NNFUSION_CHECK(0 == system(("mkdir -p nnfusion_rt/" + currentBackend + + "_codegen/Constant") + .c_str())); + FILE* fp = fopen(("nnfusion_rt/" + currentBackend + "_codegen/Constant/" + + arg_names[curr]) + .c_str(), + "wb"); + NNFUSION_CHECK(fp != nullptr); + NNFUSION_CHECK(size == fwrite(dptr, 1, size, fp)); + fclose(fp); + + fout << "NNfusionTensor " << arg_names[curr] << "(device, {" + << join_collections( + curr->get_output_shape(0), + [](int idx, ssize_t it) { return std::to_string(it); }) + << "}, sizeof(" << curr->get_output_element_type(0).c_type_string() + << "));\n"; + + fout << " NNfusionMemcpy op_" << arg_names[curr] << "(device, " + << arg_names[curr] << ", load_data<" + << curr->get_output_element_type(0).c_type_string() << ">(\"" + << arg_names[curr] << "\", " << arg_names[curr] + << ".NumElements()), true);\n"; + }; + + kernel_dict["Parameter"] = [&](std::shared_ptr& curr, + std::ofstream& fout) { + fout << "NNfusionTensor " << arg_names[curr] << "(device, {" + << join_collections( + 
curr->get_output_shape(0), + [](int idx, ssize_t it) { return std::to_string(it); }) + << "}, sizeof(" << curr->get_output_element_type(0).c_type_string() + << "));\n"; + + fout << " NNfusionMemcpy op_" << arg_names[curr] << "(device, " + << arg_names[curr] << ", load_data<" + << curr->get_output_element_type(0).c_type_string() << ">(\"\", " + << arg_names[curr] << ".NumElements()));\n"; + }; + + kernel_dict["Result"] = [&](std::shared_ptr& curr, std::ofstream& fout) { + fout << "NNfusionMemcpy " << arg_names[curr] << "(device, nullptr, " + << arg_names[curr->get_in_edge(0)->get_src()] << ");\n"; + }; + + while (gen_q.size() > 0 || pend_q.size() > 0) + { + // Move to new super step if satisifed + if (!gen_q.size()) + new_super_step(); + + auto curr = gen_q.front(); + gen_q.pop_front(); + visited.insert(curr); + + auto entry = kernel_dict.find(curr->get_op_ptr()->get_op_type()); + if (entry != kernel_dict.end()) + entry->second(curr, fout); + else + { + auto ir = nnfusion::op::get_translation_v2(curr); + if (ir.empty()) + ir = nnfusion::op::get_translation(curr); + if (ir != "") + { + const char annotation[] = "## @annotation: "; + int pos = ir.find(annotation); + std::string options; + if (pos >= 0) + { + pos += sizeof(annotation) - 1; + options = ir.substr(pos); + } + print_standard_codegen(curr, fout, ir, options); + } + else + UNHANDLED_CASE(curr); + } + fout << std::endl; + + // Check its children about whether all inputs are ready (Must be put after any possible new_super_step()) + for (auto& edge : curr->get_out_edges()) + { + if (edge->is_control_edge()) + continue; + NNFUSION_CHECK(edge->get_src() == curr); + NNFUSION_CHECK(visited.count(edge->get_dst()) == 0); + + bool ready = true; + for (auto& from : edge->get_dst()->get_in_edges()) + { + if (from->is_control_edge()) + continue; + if (visited.count(from->get_src()) == 0) + { + ready = false; + break; + } + } + if (ready) + { + // Only join pend_q once + if (vis_pend.count(edge->get_dst()) == 0) + { + 
vis_pend.insert(edge->get_dst()); + pend_q.push_back(edge->get_dst()); + } + } + } + } + + fout << "#endif\n\n"; + fout << R"( + device.pCommandQueue->ExecuteCommandLists(preloadQueue.size(), preloadQueue.data()); + device.pCommandQueue->ExecuteCommandLists(cmdQueue.size(), cmdQueue.data()); + device.AwaitExecution(); +)"; + // Print Results + for (auto& curr : graph->get_outputs()) // Print output nodes + { + if (blacklist.count(curr)) + continue; + fout << arg_names[curr] << ".PrintStageBuffer<" + << curr->get_output_element_type(0).c_type_string() << ">(device, \"" + << arg_names[curr] << "\");\n"; + } + + fout << std::endl; + + nnfusion::codegen::copy_file_from_templates( + currentBackend + "/DxCompute.vcxproj", + "nnfusion_rt/" + currentBackend + "_codegen/DxCompute.vcxproj"); + nnfusion::codegen::copy_file_from_templates(currentBackend + "/run_graph.cpp", + "nnfusion_rt/" + currentBackend + + "_codegen/run_graph.cpp"); + nnfusion::codegen::copy_file_from_templates(currentBackend + "/d3dx12_helper.h", + "nnfusion_rt/" + currentBackend + + "_codegen/d3dx12_helper.h"); + nnfusion::codegen::copy_file_from_templates( + currentBackend + "/d3dx12_nnfusion.h", + "nnfusion_rt/" + currentBackend + "_codegen/d3dx12_nnfusion.h"); + NNFUSION_LOG(INFO) << currentBackend << " codegen finished."; + exit(0); + return true; + } + }; + } // namespace pass + } // namespace graph +} // namespace nnfusion From 64af7da37f8523f2046bc45a5647692f2b44ec45 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 27 Nov 2020 09:28:11 +0000 Subject: [PATCH 22/32] code sytle applied --- .../kernels/cuda_gpu/kernels/convolution.cpp | 1 - .../core/kernels/cuda_gpu/kernels/dot.cpp | 134 +++++++++--------- .../core/kernels/kernel_registration.cpp | 1 + 3 files changed, 69 insertions(+), 67 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp index 549d428c6..6b91e3956 100644 --- 
a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp @@ -87,7 +87,6 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_body() padding_below[i] = static_cast(padding_below_diff[i]); } - { // lu << "cudnnDataType_t data_type = " << get_cudnn_datatype(dtype) << ";\n"; lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 7838e4300..2b3a1543d 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -203,7 +203,9 @@ LanguageUnit_p cuda::Dot::emit_function_body() << " static_cast(output0)," << " " << n << "));\n"; } - } else if (dtype == element::f16) { + } + else if (dtype == element::f16) + { // case 1: Scalar * Tensor // if (arg0_shape.empty() || arg1_shape.empty()) // { @@ -282,83 +284,83 @@ LanguageUnit_p cuda::Dot::emit_function_body() // << " static_cast(output0)," // << " " << m << "));\n"; // } else { - size_t axes_for_m_count = arg0_shape.size() - reduction_axes; - size_t axes_for_n_count = arg1_shape.size() - reduction_axes; - size_t axes_for_k_count = reduction_axes; - size_t m = 1; - size_t n = 1; - size_t k = 1; + size_t axes_for_m_count = arg0_shape.size() - reduction_axes; + size_t axes_for_n_count = arg1_shape.size() - reduction_axes; + size_t axes_for_k_count = reduction_axes; + size_t m = 1; + size_t n = 1; + size_t k = 1; - // check if input and output size correct - // check and calculate k for arg0 and arg1 - size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k - size_t arg1_k_idx = 0; // first axe in arg1 for k + // check if input and output size correct + // check and calculate k for arg0 and arg1 + size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k + size_t arg1_k_idx = 0; // first axe in arg1 for k - for (size_t i = 0; i < 
axes_for_k_count; i++) + for (size_t i = 0; i < axes_for_k_count; i++) + { + k *= arg0_shape[arg0_k_idx]; + if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) { - k *= arg0_shape[arg0_k_idx]; - if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) - { - std::vector arg_vec{"arg0", "arg1"}; - std::vector shape_vec{arg0_shape, arg1_shape}; + std::vector arg_vec{"arg0", "arg1"}; + std::vector shape_vec{arg0_shape, arg1_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; } - // check and calculate m for arg0 and out - size_t arg0_m_idx = 0; // first axe in arg0 for m - size_t out_m_idx = 0; // first axe in out for m - for (size_t i = 0; i < axes_for_m_count; i++) + } + // check and calculate m for arg0 and out + size_t arg0_m_idx = 0; // first axe in arg0 for m + size_t out_m_idx = 0; // first axe in out for m + for (size_t i = 0; i < axes_for_m_count; i++) + { + m *= arg0_shape[arg0_m_idx]; + if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) { - m *= arg0_shape[arg0_m_idx]; - if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) - { - std::vector arg_vec{"arg0", "output"}; - std::vector shape_vec{arg0_shape, out_shape}; + std::vector arg_vec{"arg0", "output"}; + std::vector shape_vec{arg0_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; } - // check and calculate n 
for arg1 and out - size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n - size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n - for (size_t i = 0; i < axes_for_n_count; i++) + } + // check and calculate n for arg1 and out + size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n + size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n + for (size_t i = 0; i < axes_for_n_count; i++) + { + n *= arg1_shape[arg1_n_idx]; + if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) { - n *= arg1_shape[arg1_n_idx]; - if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) - { - std::vector arg_vec{"arg1", "output"}; - std::vector shape_vec{arg1_shape, out_shape}; + std::vector arg_vec{"arg1", "output"}; + std::vector shape_vec{arg1_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; } + } - lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << " CUBLAS_OP_N," - << " CUBLAS_OP_N," - << " " << n << "," - << " " << m << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << n << "," - << " static_cast(input0)," - << " " << k << "," - << " &beta," - << " static_cast(output0)," - << " " << n << "));\n"; + lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << 
"));\n"; // } } else { diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp index 9aba33df2..786c41d87 100644 --- a/src/nnfusion/core/kernels/kernel_registration.cpp +++ b/src/nnfusion/core/kernels/kernel_registration.cpp @@ -5,6 +5,7 @@ #include "nnfusion/common/type/element_type.hpp" #include "nnfusion/util/util.hpp" #include "ngraph/src/nnfusion/common/type/element_type.hpp" +#include "nnfusion/util/util.hpp" using namespace nnfusion; using namespace nnfusion::kernels; From 3dfda190d56876df44902f757757804963572ec5 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 11 Dec 2020 14:02:15 +0800 Subject: [PATCH 23/32] fix DataBuffer --- src/nnfusion/frontend/onnx_import/util/graph_convert.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp b/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp index 2c29954c2..dfa79661f 100644 --- a/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp +++ b/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp @@ -143,7 +143,7 @@ namespace nnfusion onnx::ModelProto proto_without_init; proto_without_init.CopyFrom(model_proto); proto_without_init.mutable_graph()->mutable_initializer()->Clear(); - NNFUSION_LOG(INFO) << proto_without_init.DebugString(); + // NNFUSION_LOG(INFO) << proto_without_init.DebugString(); } std::string From 0bd8eba41722ea1e01c939f0f8bd2116f994fee6 Mon Sep 17 00:00:00 2001 From: Niupple Date: Fri, 11 Dec 2020 17:10:16 +0800 Subject: [PATCH 24/32] onnx changed to DataBuffer style import --- .../frontend/onnx_import/core/tensor.hpp | 50 ++---- .../frontend/onnx_import/util/util.cpp | 144 ++++++++++++++---- .../frontend/onnx_import/util/util.hpp | 5 +- 3 files changed, 133 insertions(+), 66 deletions(-) diff --git a/src/nnfusion/frontend/onnx_import/core/tensor.hpp b/src/nnfusion/frontend/onnx_import/core/tensor.hpp index 415abc080..85a624b14 100644 --- 
a/src/nnfusion/frontend/onnx_import/core/tensor.hpp +++ b/src/nnfusion/frontend/onnx_import/core/tensor.hpp @@ -22,6 +22,7 @@ #pragma once #include "../util/util.hpp" +#include "ngraph/src/nnfusion/common/type/data_buffer.hpp" namespace nnfusion { @@ -55,50 +56,31 @@ namespace nnfusion return detail::get_data(*m_tensor_proto); } + DataBuffer buffer_get_data() const + { + return detail::buffer_get_data(*m_tensor_proto); + } + const std::string& get_name() const { NNFUSION_CHECK(m_tensor_proto->has_name()) << "tensor has no name specified."; return m_tensor_proto->name(); } - const element::Type& get_ng_type() const + element::Type get_ng_type() const { NNFUSION_CHECK(m_tensor_proto->has_data_type()) << "tensor has no data type specified."; - switch (m_tensor_proto->data_type()) - { - case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: - return element::boolean; - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: - return element::f32; - case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: - return element::f64; - case onnx::TensorProto_DataType::TensorProto_DataType_INT8: return element::i8; - case onnx::TensorProto_DataType::TensorProto_DataType_INT16: - return element::i16; - case onnx::TensorProto_DataType::TensorProto_DataType_INT32: - return element::i32; - case onnx::TensorProto_DataType::TensorProto_DataType_INT64: - return element::i64; - case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: return element::u8; - case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: - return element::u16; - case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: - return element::u32; - case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: - return element::u64; - case onnx::TensorProto_DataType::TensorProto_DataType_UNDEFINED: - NNFUSION_CHECK_FAIL() << "data type is not defined"; - break; - default: - NNFUSION_CHECK_FAIL() - << "unsupported data type: " 
- << onnx::TensorProto_DataType_Name( - onnx::TensorProto_DataType(m_tensor_proto->data_type())); - break; - } + element::Type element_type; + bool status; + status = ONNXDataTypeToNNFusionElementType( + static_cast(m_tensor_proto->data_type()), + &element_type); + NNFUSION_CHECK(status) << "Data type not supported: " + << m_tensor_proto->data_type(); + + return element_type; } operator onnx::TensorProto_DataType() const diff --git a/src/nnfusion/frontend/onnx_import/util/util.cpp b/src/nnfusion/frontend/onnx_import/util/util.cpp index d6f52653d..28a56620f 100644 --- a/src/nnfusion/frontend/onnx_import/util/util.cpp +++ b/src/nnfusion/frontend/onnx_import/util/util.cpp @@ -28,7 +28,7 @@ namespace nnfusion { namespace onnx_import { - bool ONNXDataTypeToNNFusionElementType(const onnx::TensorProto_DataType onnx_dt, + bool ONNXDataTypeToNNFusionElementType(onnx::TensorProto_DataType onnx_dt, nnfusion::element::Type* nnfusion_et) { switch (onnx_dt) @@ -36,8 +36,10 @@ namespace nnfusion case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: *nnfusion_et = element::boolean; break; - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: + *nnfusion_et = element::f16; + break; + case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: *nnfusion_et = element::f32; break; case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: @@ -86,35 +88,38 @@ namespace nnfusion const Shape shape, const Tensor& tensor) { - switch (onnx_et) - { - case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: - return make_constant_op(element::boolean, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: - case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: - return make_constant_op(element::f32, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: - return make_constant_op(element::f64, shape, tensor); - case 
onnx::TensorProto_DataType::TensorProto_DataType_INT8: - return make_constant_op(element::i8, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_INT16: - return make_constant_op(element::i16, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_INT32: - return make_constant_op(element::i32, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_INT64: - return make_constant_op(element::i64, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: - return make_constant_op(element::u8, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: - return make_constant_op(element::u16, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: - return make_constant_op(element::u32, shape, tensor); - case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: - return make_constant_op(element::u64, shape, tensor); - default: - NNFUSION_CHECK_FAIL() << "unsupported value info element type: " - << onnx::TensorProto_DataType_Name(onnx_et); - } + element::Type element_type = tensor.get_ng_type(); + return std::make_shared( + element_type, shape, tensor.buffer_get_data()); + // switch (onnx_et) + // { + // case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: + // return make_constant_op(element::boolean, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: + // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: + // return make_constant_op(element::f32, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: + // return make_constant_op(element::f64, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_INT8: + // return make_constant_op(element::i8, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_INT16: + // return make_constant_op(element::i16, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_INT32: + // 
return make_constant_op(element::i32, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_INT64: + // return make_constant_op(element::i64, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: + // return make_constant_op(element::u8, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: + // return make_constant_op(element::u16, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: + // return make_constant_op(element::u32, shape, tensor); + // case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: + // return make_constant_op(element::u64, shape, tensor); + // default: + // NNFUSION_CHECK_FAIL() << "unsupported value info element type: " + // << onnx::TensorProto_DataType_Name(onnx_et); + // } } std::shared_ptr GetInputNode(const NodeMap& all_ng_nodes, @@ -280,6 +285,83 @@ namespace nnfusion name, std::vector(kernel_shape.size(), 1UL)); } + DataBuffer detail::buffer_get_data(const onnx::TensorProto& tensor) + { + size_t n_element = 1; + element::Type type; + bool status; + auto onnx_dt = static_cast(tensor.data_type()); + + status = ONNXDataTypeToNNFusionElementType(onnx_dt, &type); + + NNFUSION_CHECK(status) << "Unsupported ONNX data_type " << tensor.data_type() + << " is found"; + + DataBuffer buf(type); + + for (auto dim : tensor.dims()) + { + n_element *= dim; + } + buf.resize(n_element); + + if (tensor.has_raw_data()) + { + buf.load(tensor.raw_data().data(), n_element); + } + else + { +#define GET_VALUE(pb_type, mid_type) \ + do \ + { \ + const void* dat; \ + mid_type m; \ + NNFUSION_CHECK(n_element == tensor.pb_type##_data_size()) \ + << "Tensor shape is not the same with tensor data_size. 
(" << n_element \ + << " != " << tensor.pb_type##_data_size() << ")"; \ + for (size_t i = 0; i < n_element; ++i) \ + { \ + m = static_cast(tensor.pb_type##_data()[i]); \ + dat = reinterpret_cast(&m); \ + buf.setElement(i, dat); \ + } \ + } while (0) + + switch (onnx_dt) + { + case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: + GET_VALUE(int32, element::half); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: + GET_VALUE(float, float); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: + GET_VALUE(double, double); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_INT32: + GET_VALUE(int32, int32_t); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_INT64: + GET_VALUE(int64, int64_t); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: + GET_VALUE(uint64, uint64_t); + break; + case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: + case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: + case onnx::TensorProto_DataType::TensorProto_DataType_INT16: + case onnx::TensorProto_DataType::TensorProto_DataType_INT8: + case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: + case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: + default: + NNFUSION_CHECK_FAIL() << "unsupported onnx element type: " + << onnx::TensorProto_DataType_Name(onnx_dt); + } +#undef GET_VALUE + } + return buf; + } + } // namespace onnx_import } // namespace frontend } // namespace nnfusion diff --git a/src/nnfusion/frontend/onnx_import/util/util.hpp b/src/nnfusion/frontend/onnx_import/util/util.hpp index 871ab4801..31bf27726 100644 --- a/src/nnfusion/frontend/onnx_import/util/util.hpp +++ b/src/nnfusion/frontend/onnx_import/util/util.hpp @@ -28,6 +28,7 @@ #include #include "../onnx_base.hpp" +#include "ngraph/src/nnfusion/common/type/data_buffer.hpp" #include "nnfusion/common/common.hpp" namespace nnfusion @@ -51,6 +52,8 @@ namespace nnfusion return {it, it + 
(raw_data.size() / sizeof(T))}; } + DataBuffer buffer_get_data(const onnx::TensorProto& tensor); + template inline std::vector get_data(const onnx::TensorProto& tensor) { @@ -186,7 +189,7 @@ namespace nnfusion class Tensor; class Node; - bool ONNXDataTypeToNNFusionElementType(const onnx::TensorProto_DataType onnx_dt, + bool ONNXDataTypeToNNFusionElementType(onnx::TensorProto_DataType onnx_dt, nnfusion::element::Type* nnfusion_et); template From 9cefad53f66ed3c324c79a78cbbbfaf6c81facbf Mon Sep 17 00:00:00 2001 From: Niupple Date: Thu, 17 Dec 2020 11:22:50 +0800 Subject: [PATCH 25/32] fix onnx fp16 --- src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp | 4 ++-- src/nnfusion/frontend/onnx_import/op/constant.hpp | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp b/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp index 76d40424d..42c782ae4 100644 --- a/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp +++ b/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp @@ -25,6 +25,6 @@ namespace nnfusion public: bool run_on_graph(std::shared_ptr& graph) override; }; - } // namespace pass - } // namespace graph + } // namespace graph + } // namespace pass } // namespace nnfusion diff --git a/src/nnfusion/frontend/onnx_import/op/constant.hpp b/src/nnfusion/frontend/onnx_import/op/constant.hpp index f3dd0bfc9..4280f9f86 100644 --- a/src/nnfusion/frontend/onnx_import/op/constant.hpp +++ b/src/nnfusion/frontend/onnx_import/op/constant.hpp @@ -65,8 +65,11 @@ namespace nnfusion Node node(node_proto); auto tensor = node.get_attribute_value("value"); - const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type()); - auto op = func_param(tensor.get_ng_type(), tensor); + // const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type()); + // auto op = func_param(tensor.get_ng_type(), tensor); + auto op = std::make_shared( + tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data() + 
); op->set_name(node_proto.output(0)); auto gnode = m_graph->add_node_and_edge(op, graph::GNodeVector({})); From 6ded1628d9febdfdd8003d7b2ec11f992f3a2b27 Mon Sep 17 00:00:00 2001 From: Niupple Date: Tue, 29 Dec 2020 13:34:29 +0800 Subject: [PATCH 26/32] bert l1 runnable --- .../core/kernels/cuda_gpu/cuda_helper.cpp | 5 + .../kernels/cuda_gpu/kernels/batch_matmul.cpp | 18 ++- .../core/kernels/cuda_gpu/kernels/dot.cpp | 147 +++++++++--------- .../core/kernels/cuda_gpu/kernels/reduce.hpp | 2 +- .../core/kernels/cuda_gpu/kernels/softmax.cpp | 3 +- .../engine/pass/graph/kernel_tuning.cpp | 6 + .../frontend/onnx_import/core/tensor.hpp | 2 +- .../frontend/onnx_import/op/constant.hpp | 3 +- .../frontend/onnx_import/util/util.hpp | 2 +- 9 files changed, 104 insertions(+), 84 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp index c1809a5cb..54a73cfb1 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp @@ -33,6 +33,11 @@ LanguageUnit_p cuda::get_math_kernel(const std::string& name, writer << ")\n"; writer << "{\n"; writer.indent++; + if (name == "convert" && data_types[num_inputs] == "half" && data_types[0] == "int64_t") + { + writer << "return (long long)" + math_kernel << ";\n"; + } + else { writer << "return " + math_kernel << ";\n"; } diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp index 173e95e93..c42d7780c 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp @@ -8,6 +8,7 @@ // [a] ./new_kernel_0.cpp // [b] ../../../ops/op_define/new_op_0.cpp +#include #include "../cuda_emitter.hpp" #include "../cuda_langunit.hpp" #include "nnfusion/core/operators/generic_op/generic_op.hpp" @@ -52,6 +53,15 @@ namespace nnfusion const nnfusion::Shape& input_shape_0 = 
m_context->inputs[0]->get_shape(); const nnfusion::Shape& input_shape_1 = m_context->inputs[1]->get_shape(); + element::Type dtype0 = m_context->inputs[0]->get_element_type(); + element::Type dtype1 = m_context->inputs[1]->get_element_type(); + element::Type dtype2 = m_context->outputs[0]->get_element_type(); + NNFUSION_CHECK(dtype0 == dtype1 && dtype1 == dtype2) + << "Unsupported element type combination of (" << dtype0.c_type_string() + << ", " << dtype1.c_type_string() << ") -> " << dtype2.c_type_string() + << "."; + element::Type& dtype = dtype0; + bool transA = generic_op->localOpConfig.getRoot()["adj_x"]["b"]; bool transB = generic_op->localOpConfig.getRoot()["adj_y"]["b"]; size_t A1 = 1LU; @@ -92,10 +102,11 @@ namespace nnfusion stride_b = A2 * A3, ldc = A4, stride_c = A2 * A4; } + std::string type = dtype.c_type_string(); float alpha = 1.0f, beta = 0.0f; auto code = nnfusion::op::create_code_from_template( R"( - static const float alpha = @alpha@F, beta = @beta@F; + static const @dtype@ alpha = @alpha@, beta = @beta@; // if (!@hCublas@) // CUBLAS_SAFE_CALL(@api_create@(&@hCublas@)); CUBLAS_SAFE_CALL(@api_exec@( @@ -106,7 +117,9 @@ namespace nnfusion { {"hCublas", "cublas_handle"}, {"api_create", "cublasCreate"}, - {"api_exec", "cublasSgemmStridedBatched"}, + {"api_exec", + dtype == element::f32 ? "cublasSgemmStridedBatched" + : "cublasHgemmStridedBatched"}, {"transA", transB ? "CUBLAS_OP_T" : "CUBLAS_OP_N"}, {"transB", transA ? 
"CUBLAS_OP_T" : "CUBLAS_OP_N"}, {"alpha", alpha}, @@ -121,6 +134,7 @@ namespace nnfusion {"stride_b", stride_b}, {"stride_c", stride_c}, {"batch", A1}, + {"dtype", type}, }); LanguageUnit_p _lu(new LanguageUnit(get_function_name())); diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 2b3a1543d..8b4856c53 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -86,7 +86,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() // matrix * vector else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1)) { - lu << "const float alpha = 1.0;\n const float beta = 0;\n"; + lu << "const float alpha = 1.0;\n const float beta = 0.;\n"; lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, "; if (trans_A) lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", "; @@ -107,7 +107,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() int n = trans_A ? arg0_shape[1] : arg0_shape[0]; int k = trans_A ? arg0_shape[0] : arg0_shape[1]; - lu << "const float alpha = 1.0;\nconst float beta = 0;\n"; + lu << "const float alpha = 1.0;\nconst float beta = 0.;\n"; lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle," << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") @@ -186,7 +186,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() } } - lu << "const float alpha = 1.0;\nconst float beta = 0;\n"; + lu << "const float alpha = 1.0;\nconst float beta = 0.;\n"; lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle," << " CUBLAS_OP_N," @@ -261,89 +261,84 @@ LanguageUnit_p cuda::Dot::emit_function_body() // << " static_cast(output0)," // << " 1));\n"; // } - // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) && - // (trans_A || trans_B)) - // { - // int m = trans_B ? arg1_shape[0] : arg1_shape[1]; - // int n = trans_A ? arg0_shape[1] : arg0_shape[0]; - // int k = trans_A ? 
arg0_shape[0] : arg0_shape[1]; + if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) && + (trans_A || trans_B)) + { + int m = trans_B ? arg1_shape[0] : arg1_shape[1]; + int n = trans_A ? arg0_shape[1] : arg0_shape[0]; + int k = trans_A ? arg0_shape[0] : arg0_shape[1]; - // lu << "const half alpha = 1.0;\nconst half beta = 0;\n"; + lu << "const half alpha = 1.0;\nconst half beta = 0.;\n"; - // lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - // << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") - // << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << "," - // << " " << n << "," - // << " " << k << "," - // << " &alpha," - // << " static_cast(input1)," - // << " " << arg1_shape[1] << "," - // << " static_cast(input0)," - // << " " << arg0_shape[1] << "," - // << " &beta," - // << " static_cast(output0)," - // << " " << m << "));\n"; - // } else { - size_t axes_for_m_count = arg0_shape.size() - reduction_axes; - size_t axes_for_n_count = arg1_shape.size() - reduction_axes; - size_t axes_for_k_count = reduction_axes; - size_t m = 1; - size_t n = 1; - size_t k = 1; + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") + << (trans_A ? 
" CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << "," + << " " << n << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << arg1_shape[1] << "," + << " static_cast(input0)," + << " " << arg0_shape[1] << "," + << " &beta," + << " static_cast(output0)," + << " " << m << "));\n"; + } + else + { + size_t axes_for_m_count = arg0_shape.size() - reduction_axes; + size_t axes_for_n_count = arg1_shape.size() - reduction_axes; + size_t axes_for_k_count = reduction_axes; + size_t m = 1; + size_t n = 1; + size_t k = 1; - // check if input and output size correct - // check and calculate k for arg0 and arg1 - size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k - size_t arg1_k_idx = 0; // first axe in arg1 for k + // check if input and output size correct + // check and calculate k for arg0 and arg1 + size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k + size_t arg1_k_idx = 0; // first axe in arg1 for k - for (size_t i = 0; i < axes_for_k_count; i++) - { - k *= arg0_shape[arg0_k_idx]; - if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + for (size_t i = 0; i < axes_for_k_count; i++) { - std::vector arg_vec{"arg0", "arg1"}; - std::vector shape_vec{arg0_shape, arg1_shape}; + k *= arg0_shape[arg0_k_idx]; + if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) + { + std::vector arg_vec{"arg0", "arg1"}; + std::vector shape_vec{arg0_shape, arg1_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - // check and calculate m for arg0 and out - size_t arg0_m_idx = 0; // first axe in arg0 for m - size_t out_m_idx = 0; // first axe in out for m - for (size_t i = 0; i < 
axes_for_m_count; i++) - { - m *= arg0_shape[arg0_m_idx]; - if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + // check and calculate m for arg0 and out + size_t arg0_m_idx = 0; // first axe in arg0 for m + size_t out_m_idx = 0; // first axe in out for m + for (size_t i = 0; i < axes_for_m_count; i++) { - std::vector arg_vec{"arg0", "output"}; - std::vector shape_vec{arg0_shape, out_shape}; + m *= arg0_shape[arg0_m_idx]; + if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) + { + std::vector arg_vec{"arg0", "output"}; + std::vector shape_vec{arg0_shape, out_shape}; - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; + NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; + } } - } - // check and calculate n for arg1 and out - size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n - size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n - for (size_t i = 0; i < axes_for_n_count; i++) - { - n *= arg1_shape[arg1_n_idx]; - if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + // check and calculate n for arg1 and out + size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n + size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n + for (size_t i = 0; i < axes_for_n_count; i++) { - std::vector arg_vec{"arg1", "output"}; - std::vector shape_vec{arg1_shape, out_shape}; - - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } - } + n *= arg1_shape[arg1_n_idx]; + if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) + { + std::vector arg_vec{"arg1", "output"}; + std::vector shape_vec{arg1_shape, out_shape}; lu << 
"const half alpha = 1.0f;\nconst half beta = 0.f;\n"; @@ -527,7 +522,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() } else { - NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."; + NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for kernel dot."; } //lu.block_end(); return _lu; diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp index 97353e5e8..c9bfb3c26 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp @@ -189,7 +189,7 @@ int data_idx_offset = block_idx * width; float val = 0.0; for (int tidx = thread_idx; tidx < width; tidx += block_size) { int data_idx = tidx + data_idx_offset; - val += input0[data_idx]; + val += static_cast(input0[data_idx]); } val = reduceSum(val, thread_idx, block_size, shm); if (thread_idx == 0) output0[block_idx] = val; diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp index 4f5bfa067..b83e9a832 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp @@ -203,7 +203,8 @@ LanguageUnit_p { LanguageUnit_p _lu(new LanguageUnit); auto& lu = *_lu; - string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type); + element::Type type = m_context->inputs[0]->get_element_type(); + string data_type = cuda::get_cudnn_datatype(type); string tensor_format = "CUDNN_TENSOR_NCHW"; lu << "cudnnTensorDescriptor_t " << desc << ";\n"; lu << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << desc << "));\n"; diff --git a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp index 068f7c904..751c5dc40 100644 --- a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp +++ b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp @@ -91,6 +91,12 @@ void 
print_tuning_results(std::vector> tuned_kerne << std::setw(10) << s->status << " | " << std::setw(6) << s->progress_step << "/" << FLAGS_fkernel_tuning_steps << " " << " | " << std::setw(12) << s->best_perf << " ms |\n"; + + if (fabs(s->best_perf + 1.0) < 1e-5) + { + NNFUSION_LOG(INFO) << "Kernel named \"" << s->op_name << "\" has not yet been tuned.\n" + << s->ir; + } } NNFUSION_LOG(INFO) << ss.str(); } diff --git a/src/nnfusion/frontend/onnx_import/core/tensor.hpp b/src/nnfusion/frontend/onnx_import/core/tensor.hpp index 85a624b14..201aa580d 100644 --- a/src/nnfusion/frontend/onnx_import/core/tensor.hpp +++ b/src/nnfusion/frontend/onnx_import/core/tensor.hpp @@ -22,7 +22,7 @@ #pragma once #include "../util/util.hpp" -#include "ngraph/src/nnfusion/common/type/data_buffer.hpp" +#include "nnfusion/common/type/data_buffer.hpp" namespace nnfusion { diff --git a/src/nnfusion/frontend/onnx_import/op/constant.hpp b/src/nnfusion/frontend/onnx_import/op/constant.hpp index 4280f9f86..87163617d 100644 --- a/src/nnfusion/frontend/onnx_import/op/constant.hpp +++ b/src/nnfusion/frontend/onnx_import/op/constant.hpp @@ -68,8 +68,7 @@ namespace nnfusion // const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type()); // auto op = func_param(tensor.get_ng_type(), tensor); auto op = std::make_shared( - tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data() - ); + tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data()); op->set_name(node_proto.output(0)); auto gnode = m_graph->add_node_and_edge(op, graph::GNodeVector({})); diff --git a/src/nnfusion/frontend/onnx_import/util/util.hpp b/src/nnfusion/frontend/onnx_import/util/util.hpp index 31bf27726..2f7262842 100644 --- a/src/nnfusion/frontend/onnx_import/util/util.hpp +++ b/src/nnfusion/frontend/onnx_import/util/util.hpp @@ -28,8 +28,8 @@ #include #include "../onnx_base.hpp" -#include "ngraph/src/nnfusion/common/type/data_buffer.hpp" #include "nnfusion/common/common.hpp" +#include 
"nnfusion/common/type/data_buffer.hpp" namespace nnfusion { From 66abc20a97357bf7c3e9ba78e2327c467a0b06da Mon Sep 17 00:00:00 2001 From: Niupple Date: Tue, 5 Jan 2021 17:07:37 +0800 Subject: [PATCH 27/32] change priority of evaluator runtime --- src/nnfusion/frontend/util/evaluator.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/nnfusion/frontend/util/evaluator.hpp b/src/nnfusion/frontend/util/evaluator.hpp index 46b257922..23d9bc7b8 100644 --- a/src/nnfusion/frontend/util/evaluator.hpp +++ b/src/nnfusion/frontend/util/evaluator.hpp @@ -105,21 +105,21 @@ namespace nnfusion nnfusion::profiler::IProfilingRuntime::Pointer runtime = nullptr; std::vector> kernel_regs; - runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime(); + runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime(); if (runtime->check_env()) { kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), ROCM_GPU, element::f32); - if (kernel_regs.size() == 0) - kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), CUDA_GPU, element::f32); + gnode->get_op_type(), CUDA_GPU, element::f32); } else { - runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime(); + runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime(); NNFUSION_CHECK(runtime->check_env()); kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( - gnode->get_op_type(), CUDA_GPU, element::f32); + gnode->get_op_type(), ROCM_GPU, element::f32); + if (kernel_regs.size() == 0) + kernel_regs = KernelRegistry::Global()->FindKernelRegistrations( + gnode->get_op_type(), CUDA_GPU, element::f32); } bool const_infer_success = false; From f4598b7651ace0561ebdad4aa3ee1da9f5e5f54e Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 10 Mar 2021 15:08:56 +0800 Subject: [PATCH 28/32] delete comments --- .../core/kernels/cuda_gpu/kernels/dot.cpp | 254 --------- .../pass/graph/codegen_dxcompute_pass.hpp | 533 ------------------ 
.../frontend/onnx_import/util/util.cpp | 29 - 3 files changed, 816 deletions(-) delete mode 100644 src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index b9d90dbbd..712da73c5 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -206,61 +206,6 @@ LanguageUnit_p cuda::Dot::emit_function_body() } else if (dtype == element::f16) { - // case 1: Scalar * Tensor - // if (arg0_shape.empty() || arg1_shape.empty()) - // { - // auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape); - // size_t count = nnfusion::shape_size(second); - - // string firstarg = (arg0_shape.empty() ? "input1" : "input0"); - // string secondarg = (arg0_shape.empty() ? "input0" : "input1"); - - // lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; - - // lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n"; // copy `firstarg` to `output0` - // lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n"; - // } - // // case 2: 1d Dot - // else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes)) - // { - // for (int i = 0; i < arg0_shape.size(); i++) - // { - // if (arg0_shape[i] != arg1_shape[i]) - // { - // std::vector arg_vec{"arg0", "arg1"}; - // std::vector shape_vec{arg0_shape, arg1_shape}; - - // NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - // << nnfusion::join(shape_vec) << " respectively, at Node " - // << m_context->gnode->get_name() - // << ", do not match for dot op"; - // } - // } - - // size_t count = nnfusion::shape_size(arg0_shape); - // lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; - - // lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count - // << ", static_cast(input0), 1, 
static_cast(input1), 1, " - // "static_cast(output0)));\n"; - // } - // // matrix * vector - // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1)) - // { - // lu << "const float alpha = 1.0;\n const float beta = 0;\n"; - // lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, "; - // if (trans_A) - // lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", "; - // else - // lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", "; - // lu << " &alpha," - // << " static_cast(input0)," << arg0_shape[1] << ", " - // << " static_cast(input1)," - // << " 1," - // << " &beta," - // << " static_cast(output0)," - // << " 1));\n"; - // } if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) && (trans_A || trans_B)) { @@ -361,205 +306,6 @@ LanguageUnit_p cuda::Dot::emit_function_body() } else { NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot." } - else if (dtype == element::f16) - { - // case 1: Scalar * Tensor - // if (arg0_shape.empty() || arg1_shape.empty()) - // { - // auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape); - // size_t count = nnfusion::shape_size(second); - - // string firstarg = (arg0_shape.empty() ? "input1" : "input0"); - // string secondarg = (arg0_shape.empty() ? 
"input0" : "input1"); - - // lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; - - // lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n"; // copy `firstarg` to `output0` - // lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n"; - // } - // // case 2: 1d Dot - // else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes)) - // { - // for (int i = 0; i < arg0_shape.size(); i++) - // { - // if (arg0_shape[i] != arg1_shape[i]) - // { - // std::vector arg_vec{"arg0", "arg1"}; - // std::vector shape_vec{arg0_shape, arg1_shape}; - - // NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - // << nnfusion::join(shape_vec) << " respectively, at Node " - // << m_context->gnode->get_name() - // << ", do not match for dot op"; - // } - // } - - // size_t count = nnfusion::shape_size(arg0_shape); - // lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n"; - - // lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count - // << ", static_cast(input0), 1, static_cast(input1), 1, " - // "static_cast(output0)));\n"; - // } - // // matrix * vector - // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1)) - // { - // lu << "const float alpha = 1.0;\n const float beta = 0;\n"; - // lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, "; - // if (trans_A) - // lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", "; - // else - // lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", "; - // lu << " &alpha," - // << " static_cast(input0)," << arg0_shape[1] << ", " - // << " static_cast(input1)," - // << " 1," - // << " &beta," - // << " static_cast(output0)," - // << " 1));\n"; - // } - // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) && - // (trans_A || trans_B)) - // { - // int m = trans_B ? 
arg1_shape[0] : arg1_shape[1]; - // int n = trans_A ? arg0_shape[1] : arg0_shape[0]; - // int k = trans_A ? arg0_shape[0] : arg0_shape[1]; - - // lu << "const half alpha = 1.0;\nconst half beta = 0;\n"; - - // lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - // << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") - // << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << "," - // << " " << n << "," - // << " " << k << "," - // << " &alpha," - // << " static_cast(input1)," - // << " " << arg1_shape[1] << "," - // << " static_cast(input0)," - // << " " << arg0_shape[1] << "," - // << " &beta," - // << " static_cast(output0)," - // << " " << m << "));\n"; - // } else { - size_t axes_for_m_count = arg0_shape.size() - reduction_axes; - size_t axes_for_n_count = arg1_shape.size() - reduction_axes; - size_t axes_for_k_count = reduction_axes; - size_t m = 1; - size_t n = 1; - size_t k = 1; - - // check if input and output size correct - // check and calculate k for arg0 and arg1 - size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k - size_t arg1_k_idx = 0; // first axe in arg1 for k - - for (size_t i = 0; i < axes_for_k_count; i++) - { - int m = trans_B ? arg1_shape[0] : arg1_shape[1]; - int n = trans_A ? arg0_shape[1] : arg0_shape[0]; - int k = trans_A ? arg0_shape[0] : arg0_shape[1]; - - lu << "const half alpha = 1.0;\nconst half beta = 0.;\n"; - - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") - << (trans_A ? 
" CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << "," - << " " << n << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << arg1_shape[1] << "," - << " static_cast(input0)," - << " " << arg0_shape[1] << "," - << " &beta," - << " static_cast(output0)," - << " " << m << "));\n"; - } - else - { - size_t axes_for_m_count = arg0_shape.size() - reduction_axes; - size_t axes_for_n_count = arg1_shape.size() - reduction_axes; - size_t axes_for_k_count = reduction_axes; - size_t m = 1; - size_t n = 1; - size_t k = 1; - - // check if input and output size correct - // check and calculate k for arg0 and arg1 - size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k - size_t arg1_k_idx = 0; // first axe in arg1 for k - - for (size_t i = 0; i < axes_for_k_count; i++) - { - k *= arg0_shape[arg0_k_idx]; - if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++]) - { - std::vector arg_vec{"arg0", "arg1"}; - std::vector shape_vec{arg0_shape, arg1_shape}; - - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } - } - // check and calculate m for arg0 and out - size_t arg0_m_idx = 0; // first axe in arg0 for m - size_t out_m_idx = 0; // first axe in out for m - for (size_t i = 0; i < axes_for_m_count; i++) - { - m *= arg0_shape[arg0_m_idx]; - if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++]) - { - std::vector arg_vec{"arg0", "output"}; - std::vector shape_vec{arg0_shape, out_shape}; - - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } - } - // check and calculate n for arg1 and out - size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n - size_t out_n_idx = axes_for_m_count; // first axe in arg1 for n - for (size_t i = 0; i < axes_for_n_count; i++) - { 
- n *= arg1_shape[arg1_n_idx]; - if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++]) - { - std::vector arg_vec{"arg1", "output"}; - std::vector shape_vec{arg1_shape, out_shape}; - - NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " - << nnfusion::join(shape_vec) << " respectively, at Node " - << m_context->gnode->get_name() - << ", do not match for dot op"; - } - } - - lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << " CUBLAS_OP_N," - << " CUBLAS_OP_N," - << " " << n << "," - << " " << m << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << n << "," - << " static_cast(input0)," - << " " << k << "," - << " &beta," - << " static_cast(output0)," - << " " << n << "));\n"; - } - } - else - { - NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for kernel dot."; - } //lu.block_end(); return _lu; } diff --git a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp deleted file mode 100644 index 1779ad827..000000000 --- a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp +++ /dev/null @@ -1,533 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. 
- -#pragma once - -#include "graph_pass_base.hpp" -#include "nnfusion/core/operators/generic_op/generic_op.hpp" -#include "nnfusion/core/operators/op_define/constant.hpp" -#include "nnfusion/engine/profiler/profiler.hpp" -#include "nnfusion/util/curl_request.hpp" - -using namespace nnfusion::graph; - -DECLARE_string(fdefault_device); -DECLARE_string(fantares_codegen_server); - -namespace nnfusion -{ - namespace pass - { - namespace graph - { - class DirectComputeCodegenPass : public GraphPassBase - { - std::string currentBackend; - std::string autogen(const std::string& expr) - { - if (FLAGS_fantares_codegen_server == "") - FLAGS_fantares_codegen_server = "10.150.145.98:8884"; - static std::unordered_map code_cache; - std::string response; - auto it = code_cache.find(expr); - if (it == code_cache.end()) - { - CurlRequest req(FLAGS_fantares_codegen_server); - req.add_custom_header(("COMPUTE_V1: " + expr).c_str()); - req.add_custom_header("ARGS: "); - - printf("[Autogen] %s\n", expr.c_str()); - NNFUSION_CHECK(true == req.send_request(response)); - NNFUSION_CHECK(strncmp(response.c_str(), "[ERROR]", 7) != 0) << expr << "\n" - << response; - code_cache[expr] = response; - return std::move(response); - } - else - return it->second; - } - - template - inline std::string - join_collections(const T1& vect, T2 func, bool skip_empty = false) - { - std::stringstream result; - int idx = 0; - for (auto& it : vect) - { - auto str = func(idx, it); - if (!str.size() && skip_empty) - continue; - if (idx > 0) - result << ", "; - result << str; - ++idx; - } - return result.str(); - } - - // inline int get_type_id(nnfusion::element::Type type) - // { - // // TODO: fill more type cases - // if (type == nnfusion::element::f32) - // return DT_FLOAT; - // throw std::runtime_error("Not supported element type."); - // } - - template - inline std::shared_ptr get_op_object(std::shared_ptr& curr) - { - auto _op = static_pointer_cast(curr->get_op_ptr()); - NNFUSION_CHECK_NOT_NULLPTR(_op) << 
"Node type is not " - << curr->get_op_ptr()->get_op_type(); - return _op; - } - - inline void UNHANDLED_CASE(std::shared_ptr& curr) - { - printf("## Unhandled case for %s:\n", - curr->get_op_ptr()->get_op_type().c_str()); - for (int i = 0; i < curr->get_input_size(); ++i) - printf(">> in-%d : %s\n", - i, - vector_to_string(curr->get_input_shape(i)).c_str()); - for (int i = 0; i < curr->get_output_size(); ++i) - printf(">> out-%d: %s\n", - i, - vector_to_string(curr->get_output_shape(i)).c_str()); - exit(1); - }; - - public: - bool run_on_graph(std::shared_ptr& graph) override - { - currentBackend = "dxcompute"; - - NNFUSION_LOG(INFO) << "Codegen for " << currentBackend << " starts up."; - - auto nodes = graph->get_nodes(); - std::unordered_map, int> din, dout; - - // Count degrees - for (auto& it : nodes) - { - for (auto& in_edge : it->get_in_edges()) - { - if (in_edge->is_control_edge()) - continue; - NNFUSION_CHECK(in_edge->get_dst() == it); - din[it]++; - dout[in_edge->get_src()]++; - } - } - - // Name nodes, legality checks - std::unordered_set> visited, vis_pend, blacklist; - std::unordered_set name_used; - std::unordered_map, std::string> arg_names; - for (auto& it : nodes) - { - NNFUSION_CHECK(it.get() != nullptr); - - auto arg_name = "Z0_" + it->get_op_ptr()->get_op_type() + "_" + - it->get_op_ptr()->get_name(); - for (auto& c : arg_name) - if (!isalpha(c) && !isdigit(c)) - c = '_'; - if (name_used.count(arg_name)) - { - for (int i = 1;; ++i) - { - auto alter = arg_name + "_" + std::to_string(i); - if (!name_used.count(alter)) - { - arg_name = alter; - break; - } - } - } - name_used.insert(arg_name); - arg_names[it] = arg_name; - - if (din[it] == 0 && dout[it] == 0) - visited.insert(it), blacklist.insert(it); - NNFUSION_CHECK(it->get_output_size() == 1); - } - NNFUSION_LOG(INFO) << "There are " << blacklist.size() - << " standalone GNode(s) found."; - name_used.clear(); - - // Fill offsetup nodes - std::deque> gen_q, pend_q; - for (auto& it : nodes) - { - 
if (visited.count(it)) - continue; - if (din[it] == 0) - { - gen_q.push_back(it); - } - } - - NNFUSION_CHECK( - 0 == - system(("mkdir -p nnfusion_rt/" + currentBackend + "_codegen").c_str())); - - std::ofstream fout("nnfusion_rt/" + currentBackend + "_codegen/nnfusion_rt.h"); - - fout << "#if 1\n\n"; - // Perform blockfusion - int offset = 0, step = 0; - auto new_super_step = [&]() { - while (pend_q.size()) - { - gen_q.push_back(pend_q.front()); - pend_q.pop_front(); - } - if (offset > 0) - ++step, offset = 0; - }; - - auto print_standard_codegen = [&](std::shared_ptr& curr, - std::ofstream& fout, - std::string ir, - std::string options) { - std::string code = autogen(ir); - - if (options.size() > 0) - { - if (options[0] != '|') - options = "|" + options; - if (options.back() != '|') - options += "|"; - } - - if (int(options.find("|memcpy|")) >= 0) - { - NNFUSION_CHECK(curr->get_input_size() == 1); - fout << "NNfusionTensor &" << arg_names[curr] << " = " - << arg_names[curr->get_in_edge(0)->get_src()] << ";\n"; - return; - } - - static std::unordered_map dedupe_kernels; - auto kernel = dedupe_kernels.find(code); - if (kernel == dedupe_kernels.end()) - { - NNFUSION_CHECK(0 == system(("mkdir -p nnfusion_rt/" + currentBackend + - "_codegen/HLSL") - .c_str())); - FILE* fp = fopen(("nnfusion_rt/" + currentBackend + "_codegen/HLSL/" + - arg_names[curr] + ".hlsl") - .c_str(), - "wb"); - NNFUSION_CHECK(fp != nullptr); - NNFUSION_CHECK(code.size() == fwrite(code.c_str(), 1, code.size(), fp)); - fclose(fp); - dedupe_kernels[code] = arg_names[curr]; - kernel = dedupe_kernels.find(code); - } - - fout << "// " << ir << "\n"; - if (int(options.find("|inplace_wg|")) < 0) - { - fout << "NNfusionTensor " << arg_names[curr] << "(device, {" - << join_collections( - curr->get_output_shape(0), - [](int idx, ssize_t it) { return std::to_string(it); }) - << "}, sizeof(" << curr->get_output_element_type(0).c_type_string() - << "));\n"; - - fout << " NNfusionOperator op_" << 
arg_names[curr] << "(device, {"; - for (int i = 0; i < curr->get_input_size(); ++i) - { - if (i) - fout << ", "; - fout << arg_names[curr->get_in_edge(i)->get_src()]; - } - fout << "}, { " << arg_names[curr] << " }, L\"" << kernel->second - << ".hlsl\");"; - } - else - { - fout << " NNfusionOperator op_" << arg_names[curr] << "(device, {"; - for (int i = 0; i < curr->get_input_size(); ++i) - { - if (i) - fout << ", "; - fout << arg_names[curr->get_in_edge(i)->get_src()]; - } - fout << "}, { " << arg_names[curr->get_in_edge(0)->get_src()] - << " }, L\"" << kernel->second << ".hlsl\");\n"; - fout << "auto& " << arg_names[curr] << " = " - << arg_names[curr->get_in_edge(0)->get_src()] << ";"; - } - }; - - auto codegen_for_elementwise = [&](std::shared_ptr& curr, - std::ofstream& fout, - const std::string& topi, - const std::string& options = "") { - std::string expr = " -"; - for (int i = 0; i < curr->get_input_size(); ++i) - expr += " input(\"input" + std::to_string(i) + "\", @common_shape@);"; - expr += " output(@common_shape@, " + topi + ");"; - - int num_elements = 1, y; - for (auto& it : curr->get_input_shape(0)) - num_elements *= it; - - print_standard_codegen( - curr, - fout, - op::create_code_from_template( - expr, - {{"common_shape", "[ " + std::to_string(num_elements) + " ]"}}), - options); - }; - - std::unordered_map&, std::ofstream&)>> - kernel_dict; - - // Elementwise Ops - kernel_dict["Subtract"] = [&](std::shared_ptr& curr, - std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.subtract(args(\"input0\"), args(\"input1\"))"); - }; - kernel_dict["Multiply"] = [&](std::shared_ptr& curr, - std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.multiply(args(\"input0\"), args(\"input1\"))"); - }; - kernel_dict["Divide"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.divide(args(\"input0\"), args(\"input1\"))"); - }; - kernel_dict["DivNoNan"] = 
[&](std::shared_ptr& curr, - std::ofstream& fout) { - codegen_for_elementwise( - curr, - fout, - "lambda x: tvm.te.if_then_else(args(\"input1\")[x] != " - "0, args(\"input0\")[x] / args(\"input1\")[x], 0)"); - }; - kernel_dict["Power"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.power(args(\"input0\"), args(\"input1\"))"); - }; - kernel_dict["LessEq"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.less_equal(args(\"input0\"), args(\"input1\"))"); - }; - kernel_dict["Equal"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.equal(args(\"input0\"), args(\"input1\"))"); - }; - kernel_dict["Maximum"] = [&](std::shared_ptr& curr, - std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.maximum(args(\"input0\"), args(\"input1\"))"); - }; - kernel_dict["Exp"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise(curr, fout, "topi=topi.exp(args(\"input0\"))"); - }; - kernel_dict["Negative"] = [&](std::shared_ptr& curr, - std::ofstream& fout) { - codegen_for_elementwise(curr, fout, "topi=topi.negative(args(\"input0\"))"); - }; - kernel_dict["Tanh"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise(curr, fout, "topi=topi.tanh(args(\"input0\"))"); - }; - kernel_dict["Relu6"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.clip(args(\"input0\"), 0, 6)"); - }; - kernel_dict["Sigmoid"] = [&](std::shared_ptr& curr, - std::ofstream& fout) { - codegen_for_elementwise(curr, fout, "topi=topi.sigmoid(args(\"input0\"))"); - }; - kernel_dict["Square"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise( - curr, fout, "topi=topi.multiply(args(\"input0\"), args(\"input0\"))"); - }; - kernel_dict["Rsqrt"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - 
codegen_for_elementwise(curr, fout, "topi=topi.rsqrt(args(\"input0\"))"); - }; - kernel_dict["Log"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise(curr, fout, "topi=topi.log(args(\"input0\"))"); - }; - kernel_dict["ReluBackprop"] = [&](std::shared_ptr& curr, - std::ofstream& fout) { - codegen_for_elementwise( - curr, - fout, - "lambda x: tvm.te.if_then_else(args(\"input0\")[x] > " - "0, args(\"input1\")[x], 0)"); - }; - kernel_dict["Select"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - codegen_for_elementwise( - curr, - fout, - "lambda x: tvm.te.if_then_else(args(\"input0\")[x] == " - "0, args(\"input2\")[x], args(\"input1\")[x])"); - }; - - // Non-standard Ops - kernel_dict["Constant"] = [&](std::shared_ptr& curr, - std::ofstream& fout) { - auto p_const = std::dynamic_pointer_cast(curr->get_op_ptr()); - NNFUSION_CHECK(p_const != nullptr); - const void* dptr = p_const->get_data_ptr(); - size_t size = p_const->get_data_size(); - - NNFUSION_CHECK(0 == system(("mkdir -p nnfusion_rt/" + currentBackend + - "_codegen/Constant") - .c_str())); - FILE* fp = fopen(("nnfusion_rt/" + currentBackend + "_codegen/Constant/" + - arg_names[curr]) - .c_str(), - "wb"); - NNFUSION_CHECK(fp != nullptr); - NNFUSION_CHECK(size == fwrite(dptr, 1, size, fp)); - fclose(fp); - - fout << "NNfusionTensor " << arg_names[curr] << "(device, {" - << join_collections( - curr->get_output_shape(0), - [](int idx, ssize_t it) { return std::to_string(it); }) - << "}, sizeof(" << curr->get_output_element_type(0).c_type_string() - << "));\n"; - - fout << " NNfusionMemcpy op_" << arg_names[curr] << "(device, " - << arg_names[curr] << ", load_data<" - << curr->get_output_element_type(0).c_type_string() << ">(\"" - << arg_names[curr] << "\", " << arg_names[curr] - << ".NumElements()), true);\n"; - }; - - kernel_dict["Parameter"] = [&](std::shared_ptr& curr, - std::ofstream& fout) { - fout << "NNfusionTensor " << arg_names[curr] << "(device, {" - << join_collections( - 
curr->get_output_shape(0), - [](int idx, ssize_t it) { return std::to_string(it); }) - << "}, sizeof(" << curr->get_output_element_type(0).c_type_string() - << "));\n"; - - fout << " NNfusionMemcpy op_" << arg_names[curr] << "(device, " - << arg_names[curr] << ", load_data<" - << curr->get_output_element_type(0).c_type_string() << ">(\"\", " - << arg_names[curr] << ".NumElements()));\n"; - }; - - kernel_dict["Result"] = [&](std::shared_ptr& curr, std::ofstream& fout) { - fout << "NNfusionMemcpy " << arg_names[curr] << "(device, nullptr, " - << arg_names[curr->get_in_edge(0)->get_src()] << ");\n"; - }; - - while (gen_q.size() > 0 || pend_q.size() > 0) - { - // Move to new super step if satisifed - if (!gen_q.size()) - new_super_step(); - - auto curr = gen_q.front(); - gen_q.pop_front(); - visited.insert(curr); - - auto entry = kernel_dict.find(curr->get_op_ptr()->get_op_type()); - if (entry != kernel_dict.end()) - entry->second(curr, fout); - else - { - auto ir = nnfusion::op::get_translation_v2(curr); - if (ir.empty()) - ir = nnfusion::op::get_translation(curr); - if (ir != "") - { - const char annotation[] = "## @annotation: "; - int pos = ir.find(annotation); - std::string options; - if (pos >= 0) - { - pos += sizeof(annotation) - 1; - options = ir.substr(pos); - } - print_standard_codegen(curr, fout, ir, options); - } - else - UNHANDLED_CASE(curr); - } - fout << std::endl; - - // Check its children about whether all inputs are ready (Must be put after any possible new_super_step()) - for (auto& edge : curr->get_out_edges()) - { - if (edge->is_control_edge()) - continue; - NNFUSION_CHECK(edge->get_src() == curr); - NNFUSION_CHECK(visited.count(edge->get_dst()) == 0); - - bool ready = true; - for (auto& from : edge->get_dst()->get_in_edges()) - { - if (from->is_control_edge()) - continue; - if (visited.count(from->get_src()) == 0) - { - ready = false; - break; - } - } - if (ready) - { - // Only join pend_q once - if (vis_pend.count(edge->get_dst()) == 0) - { - 
vis_pend.insert(edge->get_dst()); - pend_q.push_back(edge->get_dst()); - } - } - } - } - - fout << "#endif\n\n"; - fout << R"( - device.pCommandQueue->ExecuteCommandLists(preloadQueue.size(), preloadQueue.data()); - device.pCommandQueue->ExecuteCommandLists(cmdQueue.size(), cmdQueue.data()); - device.AwaitExecution(); -)"; - // Print Results - for (auto& curr : graph->get_outputs()) // Print output nodes - { - if (blacklist.count(curr)) - continue; - fout << arg_names[curr] << ".PrintStageBuffer<" - << curr->get_output_element_type(0).c_type_string() << ">(device, \"" - << arg_names[curr] << "\");\n"; - } - - fout << std::endl; - - nnfusion::codegen::copy_file_from_templates( - currentBackend + "/DxCompute.vcxproj", - "nnfusion_rt/" + currentBackend + "_codegen/DxCompute.vcxproj"); - nnfusion::codegen::copy_file_from_templates(currentBackend + "/run_graph.cpp", - "nnfusion_rt/" + currentBackend + - "_codegen/run_graph.cpp"); - nnfusion::codegen::copy_file_from_templates(currentBackend + "/d3dx12_helper.h", - "nnfusion_rt/" + currentBackend + - "_codegen/d3dx12_helper.h"); - nnfusion::codegen::copy_file_from_templates( - currentBackend + "/d3dx12_nnfusion.h", - "nnfusion_rt/" + currentBackend + "_codegen/d3dx12_nnfusion.h"); - NNFUSION_LOG(INFO) << currentBackend << " codegen finished."; - exit(0); - return true; - } - }; - } // namespace pass - } // namespace graph -} // namespace nnfusion diff --git a/src/nnfusion/frontend/onnx_import/util/util.cpp b/src/nnfusion/frontend/onnx_import/util/util.cpp index 28a56620f..6cd9f4316 100644 --- a/src/nnfusion/frontend/onnx_import/util/util.cpp +++ b/src/nnfusion/frontend/onnx_import/util/util.cpp @@ -91,35 +91,6 @@ namespace nnfusion element::Type element_type = tensor.get_ng_type(); return std::make_shared( element_type, shape, tensor.buffer_get_data()); - // switch (onnx_et) - // { - // case onnx::TensorProto_DataType::TensorProto_DataType_BOOL: - // return make_constant_op(element::boolean, shape, tensor); - // case 
onnx::TensorProto_DataType::TensorProto_DataType_FLOAT: - // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16: - // return make_constant_op(element::f32, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE: - // return make_constant_op(element::f64, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_INT8: - // return make_constant_op(element::i8, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_INT16: - // return make_constant_op(element::i16, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_INT32: - // return make_constant_op(element::i32, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_INT64: - // return make_constant_op(element::i64, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: - // return make_constant_op(element::u8, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_UINT16: - // return make_constant_op(element::u16, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_UINT32: - // return make_constant_op(element::u32, shape, tensor); - // case onnx::TensorProto_DataType::TensorProto_DataType_UINT64: - // return make_constant_op(element::u64, shape, tensor); - // default: - // NNFUSION_CHECK_FAIL() << "unsupported value info element type: " - // << onnx::TensorProto_DataType_Name(onnx_et); - // } } std::shared_ptr GetInputNode(const NodeMap& all_ng_nodes, From ef5276726434c107f4a226cfd072c72099622755 Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 10 Mar 2021 15:11:58 +0800 Subject: [PATCH 29/32] codesytle --- .../core/kernels/cuda_gpu/kernels/dot.cpp | 161 +++++++++--------- .../core/kernels/kernel_registration.cpp | 1 - 2 files changed, 81 insertions(+), 81 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 712da73c5..604e35e7b 100644 --- 
a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -285,92 +285,93 @@ LanguageUnit_p cuda::Dot::emit_function_body() std::vector arg_vec{"arg1", "output"}; std::vector shape_vec{arg1_shape, out_shape}; - lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << " CUBLAS_OP_N," - << " CUBLAS_OP_N," - << " " << n << "," - << " " << m << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << n << "," - << " static_cast(input0)," - << " " << k << "," - << " &beta," - << " static_cast(output0)," - << " " << n << "));\n"; - // } - - } else { - NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot." - } - //lu.block_end(); - return _lu; -} + lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; + + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << "));\n"; + // } + } + else + { + NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot." 
+ } + //lu.block_end(); + return _lu; + } -LanguageUnit_p cuda::Dot::emit_dependency() -{ - LanguageUnit_p _lu(new LanguageUnit(get_function_name() + "_dep")); - _lu->require(header::cuda); - _lu->require(header::cublas); - _lu->require(header::stdexcept); - _lu->require(header::sstream); - _lu->require(macro::CUBLAS_SAFE_CALL); - _lu->require(macro::CUDA_SAFE_CALL); - // _lu->require(declaration::cuda_fp16_scale); - //_lu->require(declaration::cublas_handle); - return _lu; -} + LanguageUnit_p cuda::Dot::emit_dependency() + { + LanguageUnit_p _lu(new LanguageUnit(get_function_name() + "_dep")); + _lu->require(header::cuda); + _lu->require(header::cublas); + _lu->require(header::stdexcept); + _lu->require(header::sstream); + _lu->require(macro::CUBLAS_SAFE_CALL); + _lu->require(macro::CUDA_SAFE_CALL); + // _lu->require(declaration::cuda_fp16_scale); + //_lu->require(declaration::cublas_handle); + return _lu; + } -LanguageUnit_p cuda::Dot::emit_function_signature() -{ - LanguageUnit_p _lu(new LanguageUnit(this->m_kernel_name + "_sig")); - auto& lu = *_lu; + LanguageUnit_p cuda::Dot::emit_function_signature() + { + LanguageUnit_p _lu(new LanguageUnit(this->m_kernel_name + "_sig")); + auto& lu = *_lu; - vector params; - for (size_t i = 0; i < m_context->inputs.size(); i++) - { - stringstream ss; - ss << m_context->inputs[i]->get_element_type().c_type_string() << "* "; - ss << "input" << i; - params.push_back(ss.str()); - } + vector params; + for (size_t i = 0; i < m_context->inputs.size(); i++) + { + stringstream ss; + ss << m_context->inputs[i]->get_element_type().c_type_string() << "* "; + ss << "input" << i; + params.push_back(ss.str()); + } - for (size_t i = 0; i < m_context->outputs.size(); i++) - { - stringstream ss; - ss << m_context->outputs[i]->get_element_type().c_type_string() << "* "; - ss << "output" << i; - params.push_back(ss.str()); - } + for (size_t i = 0; i < m_context->outputs.size(); i++) + { + stringstream ss; + ss << 
m_context->outputs[i]->get_element_type().c_type_string() << "* "; + ss << "output" << i; + params.push_back(ss.str()); + } - for (size_t i = 0; i < m_context->tensors.size(); i++) - { - stringstream ss; - ss << m_context->tensors[i]->get_element_type().c_type_string() << "* "; - // defult name is: "persit0", "persist1" ... - ss << m_context->tensors[i]->get_name(); - params.push_back(ss.str()); - } + for (size_t i = 0; i < m_context->tensors.size(); i++) + { + stringstream ss; + ss << m_context->tensors[i]->get_element_type().c_type_string() << "* "; + // defult name is: "persit0", "persist1" ... + ss << m_context->tensors[i]->get_name(); + params.push_back(ss.str()); + } - lu << "void " - << "(cublasHandle_t cublas_handle, " << join(params, ", ") << ")"; - return _lu; -} + lu << "void " + << "(cublasHandle_t cublas_handle, " << join(params, ", ") << ")"; + return _lu; + } -REGISTER_KERNEL_EMITTER( - "Dot", // op_name - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor + REGISTER_KERNEL_EMITTER( + "Dot", // op_name + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs + cuda::Dot) // constructor -REGISTER_KERNEL_EMITTER( - "Dot", // op_name - Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor + REGISTER_KERNEL_EMITTER( + "Dot", // op_name + Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs + cuda::Dot) // constructor -REGISTER_KERNEL_EMITTER( - "Dot", // op_name - Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor + REGISTER_KERNEL_EMITTER( + "Dot", // op_name + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs + cuda::Dot) // constructor diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp index 786c41d87..79d2a17c2 100644 --- 
a/src/nnfusion/core/kernels/kernel_registration.cpp +++ b/src/nnfusion/core/kernels/kernel_registration.cpp @@ -4,7 +4,6 @@ #include "kernel_registration.hpp" #include "nnfusion/common/type/element_type.hpp" #include "nnfusion/util/util.hpp" -#include "ngraph/src/nnfusion/common/type/element_type.hpp" #include "nnfusion/util/util.hpp" using namespace nnfusion; From 80904a24b47947b9e311999969c03103d6e3e7df Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 10 Mar 2021 15:24:59 +0800 Subject: [PATCH 30/32] fix dot --- .../core/kernels/cuda_gpu/kernels/dot.cpp | 165 +++++++++--------- 1 file changed, 86 insertions(+), 79 deletions(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 604e35e7b..5896bbcae 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -285,93 +285,100 @@ LanguageUnit_p cuda::Dot::emit_function_body() std::vector arg_vec{"arg1", "output"}; std::vector shape_vec{arg1_shape, out_shape}; - lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - - lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," - << " CUBLAS_OP_N," - << " CUBLAS_OP_N," - << " " << n << "," - << " " << m << "," - << " " << k << "," - << " &alpha," - << " static_cast(input1)," - << " " << n << "," - << " static_cast(input0)," - << " " << k << "," - << " &beta," - << " static_cast(output0)," - << " " << n << "));\n"; - // } - } - else - { - NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot." 
+ NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with " + << nnfusion::join(shape_vec) << " respectively, at Node " + << m_context->gnode->get_name() + << ", do not match for dot op"; } - //lu.block_end(); - return _lu; } - LanguageUnit_p cuda::Dot::emit_dependency() - { - LanguageUnit_p _lu(new LanguageUnit(get_function_name() + "_dep")); - _lu->require(header::cuda); - _lu->require(header::cublas); - _lu->require(header::stdexcept); - _lu->require(header::sstream); - _lu->require(macro::CUBLAS_SAFE_CALL); - _lu->require(macro::CUDA_SAFE_CALL); - // _lu->require(declaration::cuda_fp16_scale); - //_lu->require(declaration::cublas_handle); - return _lu; - } + lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n"; - LanguageUnit_p cuda::Dot::emit_function_signature() - { - LanguageUnit_p _lu(new LanguageUnit(this->m_kernel_name + "_sig")); - auto& lu = *_lu; + lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle," + << " CUBLAS_OP_N," + << " CUBLAS_OP_N," + << " " << n << "," + << " " << m << "," + << " " << k << "," + << " &alpha," + << " static_cast(input1)," + << " " << n << "," + << " static_cast(input0)," + << " " << k << "," + << " &beta," + << " static_cast(output0)," + << " " << n << "));\n"; + } + } + else + { + NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot." 
+ } + //lu.block_end(); + return _lu; +} - vector params; - for (size_t i = 0; i < m_context->inputs.size(); i++) - { - stringstream ss; - ss << m_context->inputs[i]->get_element_type().c_type_string() << "* "; - ss << "input" << i; - params.push_back(ss.str()); - } +LanguageUnit_p cuda::Dot::emit_dependency() +{ + LanguageUnit_p _lu(new LanguageUnit(get_function_name() + "_dep")); + _lu->require(header::cuda); + _lu->require(header::cublas); + _lu->require(header::stdexcept); + _lu->require(header::sstream); + _lu->require(macro::CUBLAS_SAFE_CALL); + _lu->require(macro::CUDA_SAFE_CALL); + // _lu->require(declaration::cuda_fp16_scale); + //_lu->require(declaration::cublas_handle); + return _lu; +} - for (size_t i = 0; i < m_context->outputs.size(); i++) - { - stringstream ss; - ss << m_context->outputs[i]->get_element_type().c_type_string() << "* "; - ss << "output" << i; - params.push_back(ss.str()); - } +LanguageUnit_p cuda::Dot::emit_function_signature() +{ + LanguageUnit_p _lu(new LanguageUnit(this->m_kernel_name + "_sig")); + auto& lu = *_lu; - for (size_t i = 0; i < m_context->tensors.size(); i++) - { - stringstream ss; - ss << m_context->tensors[i]->get_element_type().c_type_string() << "* "; - // defult name is: "persit0", "persist1" ... 
- ss << m_context->tensors[i]->get_name(); - params.push_back(ss.str()); - } + vector params; + for (size_t i = 0; i < m_context->inputs.size(); i++) + { + stringstream ss; + ss << m_context->inputs[i]->get_element_type().c_type_string() << "* "; + ss << "input" << i; + params.push_back(ss.str()); + } - lu << "void " - << "(cublasHandle_t cublas_handle, " << join(params, ", ") << ")"; - return _lu; - } + for (size_t i = 0; i < m_context->outputs.size(); i++) + { + stringstream ss; + ss << m_context->outputs[i]->get_element_type().c_type_string() << "* "; + ss << "output" << i; + params.push_back(ss.str()); + } + + for (size_t i = 0; i < m_context->tensors.size(); i++) + { + stringstream ss; + ss << m_context->tensors[i]->get_element_type().c_type_string() << "* "; + // defult name is: "persit0", "persist1" ... + ss << m_context->tensors[i]->get_name(); + params.push_back(ss.str()); + } + + lu << "void " + << "(cublasHandle_t cublas_handle, " << join(params, ", ") << ")"; + return _lu; +} - REGISTER_KERNEL_EMITTER( - "Dot", // op_name - Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor +REGISTER_KERNEL_EMITTER( + "Dot", // op_name + Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs + cuda::Dot) // constructor - REGISTER_KERNEL_EMITTER( - "Dot", // op_name - Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor +REGISTER_KERNEL_EMITTER( + "Dot", // op_name + Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs + cuda::Dot) // constructor - REGISTER_KERNEL_EMITTER( - "Dot", // op_name - Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs - cuda::Dot) // constructor +REGISTER_KERNEL_EMITTER( + "Dot", // op_name + Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs + cuda::Dot) // constructor From 65cc6f7fbeb4bb1a2b73ffbc303d35e479161a3e 
Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 10 Mar 2021 15:37:46 +0800 Subject: [PATCH 31/32] repetitive include --- src/nnfusion/core/kernels/kernel_registration.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp index 79d2a17c2..67ded5b8f 100644 --- a/src/nnfusion/core/kernels/kernel_registration.cpp +++ b/src/nnfusion/core/kernels/kernel_registration.cpp @@ -4,7 +4,6 @@ #include "kernel_registration.hpp" #include "nnfusion/common/type/element_type.hpp" #include "nnfusion/util/util.hpp" -#include "nnfusion/util/util.hpp" using namespace nnfusion; using namespace nnfusion::kernels; From 40716bb6effe4d7a9ef064570085d0f9ddfa11cd Mon Sep 17 00:00:00 2001 From: Niupple Date: Wed, 10 Mar 2021 16:47:03 +0800 Subject: [PATCH 32/32] fix a semicolon missing --- src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp index 5896bbcae..9d84ff321 100644 --- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp +++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp @@ -312,7 +312,7 @@ LanguageUnit_p cuda::Dot::emit_function_body() } else { - NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot." + NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."; } //lu.block_end(); return _lu;