From 1e112ac81c953371ed8e8c686c077c0a5aa2a87f Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 25 Nov 2020 05:08:23 +0000
Subject: [PATCH 01/32] refactor dtype handling and add initial fp16 support

---
 .../core/operators/generic_op/generic_op.hpp  |  17 ++-
 .../frontend/tensorflow_import/ops/const.cpp  | 128 +++++++-----------
 .../frontend/tensorflow_import/ops/const.hpp  |   2 +-
 .../tensorflow_import/util/graph_convert.cpp  |   2 +-
 .../frontend/tensorflow_import/util/util.cpp  |   1 +
 5 files changed, 66 insertions(+), 84 deletions(-)
diff --git a/src/nnfusion/core/operators/generic_op/generic_op.hpp b/src/nnfusion/core/operators/generic_op/generic_op.hpp
index fa500a93a..831457e32 100644
--- a/src/nnfusion/core/operators/generic_op/generic_op.hpp
+++ b/src/nnfusion/core/operators/generic_op/generic_op.hpp
@@ -5,7 +5,9 @@
 
 #include <iomanip>
 #include <limits>
+
 #include "nnfusion/common/common.hpp"
+#include "ngraph/src/nnfusion/common/type/element_type.hpp"
 
 #define REGISTER_OP(op_x)                                                                          \
     static nnfusion::op::OpConfig __register_op_##op_x = nnfusion::op::build_op_config(#op_x)
@@ -200,23 +202,26 @@ namespace nnfusion
         {
             alias_name = alias_name.empty() ? input_name : alias_name;
             config[alias_name] = input_name;
-            auto d_type = tensor->get_element_type().c_type_string();
-            if (d_type == "float")
+            auto d_type = tensor->get_element_type();
+            if (d_type == element::f32)
             {
                 config[alias_name + "_dtype"] = "float32";
             }
-            else if (d_type == "int32_t")
+            else if (d_type == element::i32)
             {
                 config[alias_name + "_dtype"] = "int32";
             }
-            else if (d_type == "int64_t")
+            else if (d_type == element::i64)
             {
                 config[alias_name + "_dtype"] = "int64";
             }
+            else if (d_type == element::f16)
+            {
+                config[alias_name + "_dtype"] = "float16";
+            }
             else
             {
-                printf("Unhandled type: %s\n", d_type.c_str());
-                assert(0);
+                NNFUSION_CHECK_FAIL_WITH_EXCEPTION() << "Unhandled type: " << d_type.c_str();
             }
             auto shape = tensor->get_shape();
             if (shape.size() == 0)
diff --git a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
index b936c0b10..9dba39c21 100644
--- a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
+++ b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
@@ -167,72 +167,46 @@ namespace nnfusion
                 // int_val, float_val, etc.
                 if (tensor_content_size == 0)
                 {
+
+#define GET_VALUES(type) do {                                               \
+                        const void* dat = nullptr;                                              \
+                        for (size_t i = 0; i < n_elements; ++i) {                               \
+                            if (tensor.type##_val_size() == 1) {                                \
+                                dat = reinterpret_cast<const void *>(&tensor.type##_val()[0]);  \
+                            } else {                                                            \
+                                dat = reinterpret_cast<const void *>(&tensor.type##_val()[i]);  \
+                            }                                                                   \
+                            values->setElement(i, dat);                                         \
+                        }                                                                       \
+                    } while(0)
+
                     values->resize(n_elements);
-                    for (size_t i = 0; i < n_elements; i++)
-                    {
-                        auto& tensor = node.attr().at("value").tensor();
-                        const void* dat = nullptr;
-                        switch (dt)
+                    auto& tensor = node.attr().at("value").tensor();
+                    size_t val_size;
+                    if (dt == tensorflow::DT_INT32) {
+                        GET_VALUES(int);
+                    } else if (dt == tensorflow::DT_INT64) {
+                        GET_VALUES(int64);
+                    } else if (dt == tensorflow::DT_BOOL) {
+                        GET_VALUES(bool);
+                    } else if (dt == tensorflow::DT_HALF) {
+                        GET_VALUES(half);
+                    } else if (dt == tensorflow::DT_FLOAT) {
+                        GET_VALUES(float);
+                    } else if (dt == tensorflow::DT_DOUBLE) {
+                        GET_VALUES(double);
+                    } else if (dt == tensorflow::DT_STRING) {
+                        values->resize(tensor.string_val()[0].length());
+                        auto it = tensor.string_val()[0].begin();
+                        for (size_t j = 0; it != tensor.string_val()[0].end(); ++j, ++it)
                         {
-                        // TODO(amprocte/NGRAPH-2502): there are more element types to support
-                        // here
-                        case tensorflow::DT_INT32:
-                            dat = reinterpret_cast<const void*>(&(tensor.int_val_size() == 1
-                                                                      ? tensor.int_val()[0]
-                                                                      : tensor.int_val()[i]));
-                            values->setElement(i, dat);
-                            break;
-                        case tensorflow::DT_INT64:
-                            dat = reinterpret_cast<const void*>(&(tensor.int64_val_size() == 1
-                                                                      ? tensor.int64_val()[0]
-                                                                      : tensor.int64_val()[i]));
-                            values->setElement(i, dat);
-                            break;
-                        case tensorflow::DT_FLOAT:
-                            dat = reinterpret_cast<const void*>(&(tensor.float_val_size() == 1
-                                                                      ? tensor.float_val()[0]
-                                                                      : tensor.float_val()[i]));
-                            values->setElement(i, dat);
-                            break;
-                        case tensorflow::DT_BOOL:
-                            dat = reinterpret_cast<const void*>(&(tensor.bool_val_size() == 1
-                                                                      ? tensor.bool_val()[0]
-                                                                      : tensor.bool_val()[i]));
-                            values->setElement(i, dat);
-                            break;
-                        case tensorflow::DT_DOUBLE:
-                            dat = reinterpret_cast<const void*>(&(tensor.double_val_size() == 1
-                                                                      ? tensor.double_val()[0]
-                                                                      : tensor.double_val()[i]));
-                            values->setElement(i, dat);
-                            break;
-                        case tensorflow::DT_STRING:
-                            if (i > 0)
-                            {
-                                // TODO: only support one dimension for string type now
-                                return false;
-                            }
-                            {
-                                values->resize(tensor.string_val()[0].length());
-                                auto it = tensor.string_val()[0].begin();
-                                for (size_t j = 0; it != tensor.string_val()[0].end(); ++j, ++it)
-                                {
-                                    values->setElement(j, reinterpret_cast<const void*>(&it));
-                                }
-                            }
-                            break;
-                        default:
-                            return false;
-                            // NGRAPH_VLOG(0)
-                            //     << "Const node has empty tensor and we don't know how to "
-                            //        "handle this element type";
-                            // NGRAPH_VLOG(0) << node.DebugString();
-                            // NGRAPH_VLOG(0) << shape.DebugString();
-                            // return errors::Unimplemented("Encountered unknown element type ",
-                            //                              DataType_Name(dt),
-                            //                              " on an empty tensor");
+                            values->setElement(j, reinterpret_cast<const void*>(&it));
                         }
+                    } else {
+                        return false;
                     }
+
+#undef GET_VALUES
                 }
                 else
                 {
@@ -372,7 +346,9 @@ namespace nnfusion
 
                 try
                 {
-                    const auto& type = TF_NGRAPH_CONST_MAP.at(dtype);
+                    element::Type type;
+                    result = TFDataTypeToNNFusionElementType(dtype, &type);
+                    NNFUSION_CHECK(result);
                     result = MakeConstOp(node, type, &ng_node);
                     NNFUSION_CHECK(result);
                 }
@@ -388,19 +364,19 @@ namespace nnfusion
                 return ret;
             }
 
-            const std::map<tensorflow::DataType, element::Type> TF_NGRAPH_CONST_MAP = {
-                {tensorflow::DataType::DT_FLOAT, nnfusion::element::f32},
-                {tensorflow::DataType::DT_DOUBLE, nnfusion::element::f64},
-                {tensorflow::DataType::DT_INT8, nnfusion::element::i8},
-                {tensorflow::DataType::DT_INT16, nnfusion::element::i16},
-                {tensorflow::DataType::DT_INT32, nnfusion::element::i32},
-                {tensorflow::DataType::DT_INT64, nnfusion::element::i64},
-                {tensorflow::DataType::DT_UINT8, nnfusion::element::u8},
-                {tensorflow::DataType::DT_UINT16, nnfusion::element::u16},
-                {tensorflow::DataType::DT_UINT32, nnfusion::element::u32},
-                {tensorflow::DataType::DT_UINT64, nnfusion::element::u64},
-                {tensorflow::DataType::DT_BOOL, nnfusion::element::boolean},
-                {tensorflow::DataType::DT_STRING, nnfusion::element::character}};
+            // const std::map<tensorflow::DataType, element::Type> TF_NGRAPH_CONST_MAP = {
+            //     {tensorflow::DataType::DT_FLOAT, nnfusion::element::f32},
+            //     {tensorflow::DataType::DT_DOUBLE, nnfusion::element::f64},
+            //     {tensorflow::DataType::DT_INT8, nnfusion::element::i8},
+            //     {tensorflow::DataType::DT_INT16, nnfusion::element::i16},
+            //     {tensorflow::DataType::DT_INT32, nnfusion::element::i32},
+            //     {tensorflow::DataType::DT_INT64, nnfusion::element::i64},
+            //     {tensorflow::DataType::DT_UINT8, nnfusion::element::u8},
+            //     {tensorflow::DataType::DT_UINT16, nnfusion::element::u16},
+            //     {tensorflow::DataType::DT_UINT32, nnfusion::element::u32},
+            //     {tensorflow::DataType::DT_UINT64, nnfusion::element::u64},
+            //     {tensorflow::DataType::DT_BOOL, nnfusion::element::boolean},
+            //     {tensorflow::DataType::DT_STRING, nnfusion::element::character}};
         } // namespace tensorflow_import
     }     // namespace frontend
 } // namespace nnfusion
diff --git a/src/nnfusion/frontend/tensorflow_import/ops/const.hpp b/src/nnfusion/frontend/tensorflow_import/ops/const.hpp
index 6b86a225d..827dc76be 100644
--- a/src/nnfusion/frontend/tensorflow_import/ops/const.hpp
+++ b/src/nnfusion/frontend/tensorflow_import/ops/const.hpp
@@ -19,7 +19,7 @@ namespace nnfusion
                                              const NodeMap& all_ng_nodes,
                                              std::shared_ptr<nnfusion::graph::Graph> m_ngraph);
 
-            extern const std::map<tensorflow::DataType, element::Type> TF_NGRAPH_CONST_MAP;
+            // extern const std::map<tensorflow::DataType, element::Type> TF_NGRAPH_CONST_MAP;
         } // namespace tensorflow_import
     }     // namespace frontend
 } // namespace nnfusion
diff --git a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp
index 12b325b5e..66ba79696 100644
--- a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp
+++ b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp
@@ -165,7 +165,7 @@ namespace nnfusion
                 NNFUSION_CHECK(status);
                 nnfusion::element::Type nnfusion_et;
                 status = TFDataTypeToNNFusionElementType(dtype, &nnfusion_et);
-                NNFUSION_CHECK(status);
+                NNFUSION_CHECK(status) << "DataType " << dtype << " is not supported.";
                 tensorflow::TensorShapeProto tf_shape = node.attr().at("shape").shape();
                 nnfusion::Shape ng_shape;
                 status = TFTensorShapeToNGraphShape(tf_shape, &ng_shape);
diff --git a/src/nnfusion/frontend/tensorflow_import/util/util.cpp b/src/nnfusion/frontend/tensorflow_import/util/util.cpp
index e060cf97b..9531b06ea 100644
--- a/src/nnfusion/frontend/tensorflow_import/util/util.cpp
+++ b/src/nnfusion/frontend/tensorflow_import/util/util.cpp
@@ -14,6 +14,7 @@ namespace nnfusion
             {
                 switch (tf_dt)
                 {
+                case tensorflow::DataType::DT_HALF: *ng_et = nnfusion::element::f16; break;
                 case tensorflow::DataType::DT_FLOAT: *ng_et = nnfusion::element::f32; break;
                 case tensorflow::DataType::DT_DOUBLE: *ng_et = nnfusion::element::f64; break;
                 case tensorflow::DataType::DT_INT8: *ng_et = nnfusion::element::i8; break;

From de54196168067d366950a95c052d923fe4d2c837 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 25 Nov 2020 05:29:20 +0000
Subject: [PATCH 02/32] fix compile error in generic_op.hpp

---
 src/nnfusion/core/operators/generic_op/generic_op.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnfusion/core/operators/generic_op/generic_op.hpp b/src/nnfusion/core/operators/generic_op/generic_op.hpp
index 831457e32..2a9f36171 100644
--- a/src/nnfusion/core/operators/generic_op/generic_op.hpp
+++ b/src/nnfusion/core/operators/generic_op/generic_op.hpp
@@ -221,7 +221,7 @@ namespace nnfusion
             }
             else
             {
-                NNFUSION_CHECK_FAIL_WITH_EXCEPTION() << "Unhandled type: " << d_type.c_str();
+                NNFUSION_CHECK_FAIL() << "Unhandled type: " << d_type;
             }
             auto shape = tensor->get_shape();
             if (shape.size() == 0)

From 272d39484cfa08a9f7716dd0cdf08783a35bcfab Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 25 Nov 2020 12:42:56 +0000
Subject: [PATCH 03/32] changes for fp16

---
 .../core/kernels/cpu/eigen/concat.cpp         |   2 +-
 .../core/kernels/cpu/eigen/convolution.cpp    |   2 +-
 src/nnfusion/core/kernels/cpu/eigen/dot.cpp   |   2 +-
 .../core/kernels/cpu/eigen/elementwise.cpp    |   2 +-
 src/nnfusion/core/kernels/cpu/eigen/lstm.cpp  |   2 +-
 .../core/kernels/cpu/eigen/max_pool.cpp       |   2 +-
 src/nnfusion/core/kernels/cpu/eigen/pad.cpp   |   2 +-
 .../core/kernels/cpu/eigen/reduce.cpp         |   2 +-
 .../core/kernels/cpu/eigen/softmax.cpp        |   2 +-
 .../core/kernels/cpu/general/anyop.cpp        |   2 +-
 .../core/kernels/cpu/general/reshape.cpp      |   2 +-
 src/nnfusion/core/kernels/cpu/mkl/dot.cpp     |   2 +-
 .../core/kernels/cpu/mlas/avg_pool.cpp        |   2 +-
 .../core/kernels/cpu/mlas/batch_matmul.cpp    |   2 +-
 .../core/kernels/cpu/mlas/convolution.cpp     |   2 +-
 src/nnfusion/core/kernels/cpu/mlas/dot.cpp    |   2 +-
 .../core/kernels/cpu/mlas/max_pool.cpp        |   2 +-
 .../kernels/cpu/reference/batch_matmul.cpp    |   2 +-
 .../core/kernels/cpu/reference/constant.cpp   |   2 +-
 .../core/kernels/cpu/reference/kernels.cpp    | 104 +++++++++---------
 .../core/kernels/cpu/reference/one_hot.cpp    |   2 +-
 .../core/kernels/cpu/reference/reduce_all.cpp |   2 +-
 .../kernels/cpu/reference/stop_gradient.cpp   |   2 +-
 .../core/kernels/cpu/reference/transpose.cpp  |   2 +-
 .../core/kernels/cpu/reference/variable.cpp   |   2 +-
 .../core/kernels/cpu/simd/elementwise.cpp     |   2 +-
 .../kernels/cpu/simd/elementwise_fused.cpp    |   2 +-
 .../core/kernels/cuda_gpu/cuda_cudnn.cpp      |  14 ++-
 .../core/kernels/cuda_gpu/cuda_cudnn.hpp      |   8 +-
 .../core/kernels/cuda_gpu/cuda_langunit.cpp   |  10 ++
 .../core/kernels/cuda_gpu/cuda_langunit.hpp   |   1 +
 .../cuda_gpu/inl/generate_kernel_code-inl.hpp |   2 +-
 .../cuda_gpu/kernels/adam_optimizer.cpp       |   2 +-
 .../core/kernels/cuda_gpu/kernels/addn.cpp    |   2 +-
 .../kernels/cuda_gpu/kernels/allreduce.cpp    |   2 +-
 .../core/kernels/cuda_gpu/kernels/anyop.cpp   |   2 +-
 .../kernels/cuda_gpu/kernels/apply_adam.cpp   |   2 +-
 .../kernels/apply_gradient_descent.cpp        |   4 +-
 .../cuda_gpu/kernels/apply_momentum.cpp       |   2 +-
 .../core/kernels/cuda_gpu/kernels/assign.cpp  |   2 +-
 .../kernels/cuda_gpu/kernels/assign_sub.cpp   |   2 +-
 .../kernels/cuda_gpu/kernels/avg_pool.cpp     |   4 +-
 .../kernels/cuda_gpu/kernels/batch_matmul.cpp |   4 +-
 .../kernels/cuda_gpu/kernels/batch_norm.cpp   |   6 +-
 .../cuda_gpu/kernels/blockfusion_fused.cpp    |   2 +-
 .../kernels/cuda_gpu/kernels/broadcast.cpp    |   4 +-
 .../core/kernels/cuda_gpu/kernels/concat.cpp  |   4 +-
 .../cuda_gpu/kernels/concat_offset.cpp        |   2 +-
 .../kernels/cuda_gpu/kernels/constant.cpp     |   2 +-
 .../kernels/cuda_gpu/kernels/convolution.cpp  |  22 +++-
 .../kernels/cuda_gpu/kernels/convolution.hpp  |   1 +
 .../crossentropy_fwdbwd_softmax_bwd_large.cpp |   2 +-
 .../cuda_gpu/kernels/depthwise_conv2d.cpp     |   2 +-
 .../core/kernels/cuda_gpu/kernels/dot.cpp     |  91 ++++++++++++++-
 .../core/kernels/cuda_gpu/kernels/dropout.cpp |   4 +-
 .../cuda_gpu/kernels/dynamic_stitch.cpp       |   2 +-
 .../kernels/cuda_gpu/kernels/elementwise.cpp  |   2 +-
 .../cuda_gpu/kernels/elementwise_fused.cpp    |   2 +-
 .../kernels/cuda_gpu/kernels/gather_1d.cpp    |   4 +-
 .../kernels/cuda_gpu/kernels/gather_nd.cpp    |   4 +-
 .../cuda_gpu/kernels/invert_permutation.cpp   |   2 +-
 .../kernels/cuda_gpu/kernels/layer_norm.cpp   |   2 +-
 .../kernels/cuda_gpu/kernels/max_pool.cpp     |   4 +-
 .../core/kernels/cuda_gpu/kernels/one_hot.cpp |   2 +-
 .../core/kernels/cuda_gpu/kernels/pad.cpp     |   4 +-
 .../core/kernels/cuda_gpu/kernels/range.cpp   |   2 +-
 .../core/kernels/cuda_gpu/kernels/reduce.cpp  |  24 ++--
 .../kernels/cuda_gpu/kernels/reduce_all.cpp   |   2 +-
 .../core/kernels/cuda_gpu/kernels/reshape.cpp |   8 +-
 .../core/kernels/cuda_gpu/kernels/result.cpp  |   2 +-
 .../core/kernels/cuda_gpu/kernels/reverse.cpp |   2 +-
 .../cuda_gpu/kernels/reverse_sequence.cpp     |   4 +-
 .../kernels/rocm/batch_gemm_fixed.cpp         |   2 +-
 .../cuda_gpu/kernels/rocm/broadcast_host.cpp  |   2 +-
 .../kernels/rocm/broadcast_kernel.cpp         |   2 +-
 .../cuda_gpu/kernels/rocm/convfwd_fixed.cpp   |   2 +-
 .../cuda_gpu/kernels/rocm/convolution.cpp     |   2 +-
 .../cuda_gpu/kernels/rocm/gemm_fixed.cpp      |   2 +-
 .../cuda_gpu/kernels/rocm/reduce_sum.cpp      |   2 +-
 .../kernels/cuda_gpu/kernels/rocm/softmax.cpp |   2 +-
 .../core/kernels/cuda_gpu/kernels/scatter.cpp |   2 +-
 .../kernels/cuda_gpu/kernels/select_node.cpp  |   2 +-
 .../core/kernels/cuda_gpu/kernels/slice.cpp   |   2 +-
 .../core/kernels/cuda_gpu/kernels/softmax.cpp |   4 +-
 .../kernels/sparse_apply_momentum.cpp         |   2 +-
 .../cuda_gpu/kernels/stop_gradient.cpp        |   2 +-
 .../cuda_gpu/kernels/strided_slice_grad.cpp   |   2 +-
 .../core/kernels/cuda_gpu/kernels/tile.cpp    |   4 +-
 .../kernels/cuda_gpu/kernels/transpose.cpp    |   2 +-
 .../cuda_gpu/kernels/unsorted_segment_sum.cpp |   2 +-
 .../kernels/cuda_gpu/kernels/variable.cpp     |   2 +-
 .../core/kernels/cuda_gpu/kernels/zeros.cpp   |   2 +-
 src/nnfusion/core/kernels/hlsl/constant.cpp   |   2 +-
 src/nnfusion/core/kernels/hlsl/parameter.cpp  |   2 +-
 src/nnfusion/core/kernels/hlsl/result.cpp     |   2 +-
 .../core/kernels/kernel_registration.cpp      |   7 +-
 .../core/kernels/kernel_registration.hpp      |   9 +-
 .../engine/pass/codegen/cuda_codegen_pass.cpp |   1 +
 .../engine/pass/graph/kernel_selection.cpp    |   6 +-
 .../frontend/tensorflow_import/ops/const.cpp  |  64 -----------
 test/main.cpp                                 |   2 +-
 101 files changed, 313 insertions(+), 259 deletions(-)

diff --git a/src/nnfusion/core/kernels/cpu/eigen/concat.cpp b/src/nnfusion/core/kernels/cpu/eigen/concat.cpp
index 89a517269..f7e3df997 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/concat.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/concat.cpp
@@ -210,5 +210,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Concat",                                                              // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::ConcatEigen)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp b/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp
index 67cbe1dd6..8bda08757 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp
@@ -144,5 +144,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Convolution",                                                         // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::ConvolutionEigen)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/dot.cpp b/src/nnfusion/core/kernels/cpu/eigen/dot.cpp
index e3523225d..4a04d623b 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/dot.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/dot.cpp
@@ -90,5 +90,5 @@ LanguageUnit_p cpu::Dot::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "Dot",                                                                 // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::Dot)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp b/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp
index 94db71307..bfc79215f 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp
@@ -8,7 +8,7 @@ using namespace nnfusion::kernels;
 
 #define REGISTER_EW_KERNEL(OP_NAME)                                                                \
     REGISTER_KERNEL_EMITTER("" #OP_NAME "",                                                        \
-                            Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), \
+                            Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \
                             cpu::ElementwiseEigen<nnfusion::op::OP_NAME>);
 
 REGISTER_EW_KERNEL(Abs)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp b/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp
index e29e8c456..8c3ed6046 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp
@@ -145,7 +145,7 @@ LanguageUnit_p cpu::LstmEigen::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "Lstm",                                                                // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::LstmEigen)
 
 void cpu::LstmEigen::emit_compute_input_helper(nnfusion::codegen::CodeWriter& lu)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp b/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp
index e8ed45698..cc55948d7 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp
@@ -169,5 +169,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "MaxPool",                                                             // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::MaxPoolEigen)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/pad.cpp b/src/nnfusion/core/kernels/cpu/eigen/pad.cpp
index 9e05125d4..aa82543a8 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/pad.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/pad.cpp
@@ -8,5 +8,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Pad",                                                                 // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::Pad<float>)                                                       // constructor
diff --git a/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp b/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp
index 195574a31..86065f66c 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp
@@ -8,7 +8,7 @@ using namespace nnfusion::kernels;
 
 #define REGISTER_EW_KERNEL(OP_NAME)                                                                \
     REGISTER_KERNEL_EMITTER("" #OP_NAME "",                                                        \
-                            Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), \
+                            Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \
                             cpu::ReduceEigen<nnfusion::op::OP_NAME>);
 
 //REGISTER_EW_KERNEL(Sum)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp b/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp
index 45546ddda..cb7a99800 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp
@@ -8,5 +8,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Softmax",                                                             // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("eigen").Priority(4), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::SoftmaxEigen<float>)                                              // constructor
diff --git a/src/nnfusion/core/kernels/cpu/general/anyop.cpp b/src/nnfusion/core/kernels/cpu/general/anyop.cpp
index 9ec7d6f06..de5620391 100644
--- a/src/nnfusion/core/kernels/cpu/general/anyop.cpp
+++ b/src/nnfusion/core/kernels/cpu/general/anyop.cpp
@@ -36,5 +36,5 @@ LanguageUnit_p cpu::AnyOP::emit_dependency()
 // Register Pad kernel emitter
 
 REGISTER_KERNEL_EMITTER("AnyOP",                                                  //op_name
-                        Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs
+                        Device(GENERIC_CPU).TypeConstraint(element::f32).Priority(2), //attrs
                         cpu::AnyOP)                                               // constructor
diff --git a/src/nnfusion/core/kernels/cpu/general/reshape.cpp b/src/nnfusion/core/kernels/cpu/general/reshape.cpp
index a0774c5ca..f8e694d00 100644
--- a/src/nnfusion/core/kernels/cpu/general/reshape.cpp
+++ b/src/nnfusion/core/kernels/cpu/general/reshape.cpp
@@ -96,5 +96,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("Reshape", //op_name
-                        Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("cpu").Priority(2), //attrs
+                        Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("cpu").Priority(2), //attrs
                         cpu::ReshapeMemcpy) //constructor
diff --git a/src/nnfusion/core/kernels/cpu/mkl/dot.cpp b/src/nnfusion/core/kernels/cpu/mkl/dot.cpp
index cb414b767..eb4e77fa4 100644
--- a/src/nnfusion/core/kernels/cpu/mkl/dot.cpp
+++ b/src/nnfusion/core/kernels/cpu/mkl/dot.cpp
@@ -178,5 +178,5 @@ LanguageUnit_p cpu::DotMkl::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "Dot",                                                               // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mkl").Priority(3), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mkl").Priority(3), // attrs
     cpu::DotMkl)
diff --git a/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp b/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp
index ae8ca6a91..68f8c484e 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp
@@ -109,5 +109,5 @@ LanguageUnit_p cpu::AvgPoolMlas::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "AvgPool",                                                            // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
     cpu::AvgPoolMlas)                                                     // constructor
diff --git a/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp b/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp
index 4e25e1bac..bcd21a959 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp
@@ -115,5 +115,5 @@ LanguageUnit_p cpu::BatchMatMulMlas::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "BatchMatMul",                                                        // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
     cpu::BatchMatMulMlas)
diff --git a/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp b/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp
index ed5a9f594..48f635f5a 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp
@@ -185,5 +185,5 @@ LanguageUnit_p cpu::ConvolutionMlas::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "Convolution",                                                        // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
     cpu::ConvolutionMlas)                                                 // constructor
diff --git a/src/nnfusion/core/kernels/cpu/mlas/dot.cpp b/src/nnfusion/core/kernels/cpu/mlas/dot.cpp
index 6f5fc3119..37ae88445 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/dot.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/dot.cpp
@@ -112,5 +112,5 @@ LanguageUnit_p cpu::DotMlas::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "Dot",                                                                // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
     cpu::DotMlas)
diff --git a/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp b/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp
index 419b8dfba..85e2bc94b 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp
@@ -111,5 +111,5 @@ LanguageUnit_p cpu::MaxPoolMlas::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "MaxPool",                                                            // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("mlas").Priority(6), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
     cpu::MaxPoolMlas)                                                     // constructor
diff --git a/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp b/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp
index f8b9a0e99..bbff5a1bd 100644
--- a/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp
@@ -114,7 +114,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "BatchMatMul",                                                 // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 BatchMatMulRef)                                                // constructor
 
         } // namespace cpu
diff --git a/src/nnfusion/core/kernels/cpu/reference/constant.cpp b/src/nnfusion/core/kernels/cpu/reference/constant.cpp
index e6ae61aa2..e7094a52f 100644
--- a/src/nnfusion/core/kernels/cpu/reference/constant.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/constant.cpp
@@ -70,5 +70,5 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Constant",                                   //op_name
-                        Device(GENERIC_CPU).TypeConstraint(DT_FLOAT), //attrs
+                        Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
                         cpu::Constant)                                // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cpu/reference/kernels.cpp b/src/nnfusion/core/kernels/cpu/reference/kernels.cpp
index 13755c86f..896fa385e 100644
--- a/src/nnfusion/core/kernels/cpu/reference/kernels.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/kernels.cpp
@@ -2137,7 +2137,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Abs",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AbsRef)                                                        // constructor
 
             class AcosRef : public KernelEmitter
@@ -2175,7 +2175,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Acos",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AcosRef)                                                       // constructor
 
             class AddRef : public KernelEmitter
@@ -2213,7 +2213,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Add",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AddRef)                                                        // constructor
 
             class AllReduceRef : public KernelEmitter
@@ -2252,7 +2252,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "AllReduce",                                                   // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AllReduceRef)                                                  // constructor
 
             class AsinRef : public KernelEmitter
@@ -2290,7 +2290,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Asin",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AsinRef)                                                       // constructor
 
             class AtanRef : public KernelEmitter
@@ -2328,7 +2328,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Atan",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AtanRef)                                                       // constructor
 
             class BroadcastRef : public KernelEmitter
@@ -2368,7 +2368,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Broadcast",                                                   // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 BroadcastRef)                                                  // constructor
 
             class CeilingRef : public KernelEmitter
@@ -2406,7 +2406,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Ceiling",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 CeilingRef)                                                    // constructor
 
             class ConcatRef : public KernelEmitter
@@ -2453,7 +2453,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Concat",                                                      // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ConcatRef)                                                     // constructor
 
             /*
@@ -2489,7 +2489,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Constant",                                                    // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ConstantRef)                                                   // constructor
             */
 
@@ -2530,7 +2530,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Convert",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ConvertRef)                                                    // constructor
 
             class ConvolutionRef : public KernelEmitter
@@ -2575,7 +2575,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Convolution",                                                 // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ConvolutionRef)                                                // constructor
 
             class CosRef : public KernelEmitter
@@ -2613,7 +2613,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Cos",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 CosRef)                                                        // constructor
 
             class CoshRef : public KernelEmitter
@@ -2651,7 +2651,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Cosh",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 CoshRef)                                                       // constructor
 
             class DivideRef : public KernelEmitter
@@ -2689,7 +2689,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Divide",                                                      // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 DivideRef)                                                     // constructor
 
             class EqualRef : public KernelEmitter
@@ -2727,7 +2727,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Equal",                                                       // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 EqualRef)                                                      // constructor
 
             class ExpRef : public KernelEmitter
@@ -2765,7 +2765,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Exp",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ExpRef)                                                        // constructor
 
             class FloorRef : public KernelEmitter
@@ -2803,7 +2803,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Floor",                                                       // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 FloorRef)                                                      // constructor
 
             class GreaterRef : public KernelEmitter
@@ -2841,7 +2841,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Greater",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 GreaterRef)                                                    // constructor
 
             class LessRef : public KernelEmitter
@@ -2879,7 +2879,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Less",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 LessRef)                                                       // constructor
 
             class LogRef : public KernelEmitter
@@ -2917,7 +2917,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Log",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 LogRef)                                                        // constructor
 
             class LRNRef : public KernelEmitter
@@ -2957,7 +2957,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "LRN",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 LRNRef)                                                        // constructor
 
             class MaxRef : public KernelEmitter
@@ -2997,7 +2997,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Max",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 MaxRef)                                                        // constructor
 
             class MaximumRef : public KernelEmitter
@@ -3035,7 +3035,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Maximum",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 MaximumRef)                                                    // constructor
 
             class MinRef : public KernelEmitter
@@ -3075,7 +3075,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Min",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 MinRef)                                                        // constructor
 
             class MinimumRef : public KernelEmitter
@@ -3113,7 +3113,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Minimum",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 MinimumRef)                                                    // constructor
 
             class MultiplyRef : public KernelEmitter
@@ -3151,7 +3151,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Multiply",                                                    // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 MultiplyRef)                                                   // constructor
 
             class NegativeRef : public KernelEmitter
@@ -3189,7 +3189,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Negative",                                                    // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 NegativeRef)                                                   // constructor
 
             class PowerRef : public KernelEmitter
@@ -3227,7 +3227,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Power",                                                       // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 PowerRef)                                                      // constructor
 
             class ProductRef : public KernelEmitter
@@ -3267,7 +3267,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Product",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ProductRef)                                                    // constructor
 
             class ReluRef : public KernelEmitter
@@ -3305,7 +3305,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Relu",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ReluRef)                                                       // constructor
 
             class SelectRef : public KernelEmitter
@@ -3344,7 +3344,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Select",                                                      // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SelectRef)                                                     // constructor
 
             class SigmoidRef : public KernelEmitter
@@ -3382,7 +3382,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Sigmoid",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SigmoidRef)                                                    // constructor
 
             class SignRef : public KernelEmitter
@@ -3420,7 +3420,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Sign",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SignRef)                                                       // constructor
 
             class SinRef : public KernelEmitter
@@ -3458,7 +3458,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Sin",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SinRef)                                                        // constructor
 
             class SinhRef : public KernelEmitter
@@ -3496,7 +3496,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Sinh",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SinhRef)                                                       // constructor
 
             class SliceRef : public KernelEmitter
@@ -3537,7 +3537,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Slice",                                                       // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SliceRef)                                                      // constructor
 
             class SoftmaxRef : public KernelEmitter
@@ -3581,7 +3581,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Softmax",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SoftmaxRef)                                                    // constructor
 
             class SqrtRef : public KernelEmitter
@@ -3619,7 +3619,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Sqrt",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SqrtRef)                                                       // constructor
 
             class SubtractRef : public KernelEmitter
@@ -3657,7 +3657,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Subtract",                                                    // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SubtractRef)                                                   // constructor
 
             class SumRef : public KernelEmitter
@@ -3697,7 +3697,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Sum",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 SumRef)                                                        // constructor
 
             class TanRef : public KernelEmitter
@@ -3735,7 +3735,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Tan",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 TanRef)                                                        // constructor
 
             class TanhRef : public KernelEmitter
@@ -3773,7 +3773,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Tanh",                                                        // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 TanhRef)                                                       // constructor
 
             class BatchNormRef : public KernelEmitter
@@ -3812,7 +3812,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "BatchNormInference",                                          // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 BatchNormRef)
 
             class AvgPoolRef : public KernelEmitter
@@ -3856,7 +3856,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "AvgPool",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AvgPoolRef)
 
             class DotRef : public KernelEmitter
@@ -3897,7 +3897,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Dot",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 DotRef)
 
             class MaxPoolRef : public KernelEmitter
@@ -3940,7 +3940,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "MaxPool",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 MaxPoolRef)
 
             class PadRef : public KernelEmitter
@@ -3982,7 +3982,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Pad",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 PadRef)
 
             class ReshapeRef : public KernelEmitter
@@ -4022,7 +4022,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Reshape",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ReshapeRef)
 
             class ResultRef : public KernelEmitter
@@ -4068,7 +4068,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Result",                                                      // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ResultRef)
 
             class LessEqRef : public KernelEmitter
@@ -4106,7 +4106,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "LessEq",                                                      // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 LessEqRef)
 
             class ReverseRef : public KernelEmitter
@@ -4146,7 +4146,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Reverse",                                                     // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ReverseRef)
 
         } // namespace cpu
diff --git a/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp b/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp
index 4aeec8b82..ddaa06613 100644
--- a/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp
@@ -69,7 +69,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "OneHot",                                                      // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 OneHotRef)                                                     // constructor
 
         } // namespace cpu
diff --git a/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp b/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp
index 1f27075fb..20362bfad 100644
--- a/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp
@@ -62,7 +62,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "All",                                                         // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AllRef)                                                        // constructor
 
         } // namespace cpu
diff --git a/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp b/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp
index 3bb049f38..531cea738 100644
--- a/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp
@@ -55,7 +55,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "StopGradient",                                                // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 StopGradientRef)                                               // constructor
 
         } // namespace cpu
diff --git a/src/nnfusion/core/kernels/cpu/reference/transpose.cpp b/src/nnfusion/core/kernels/cpu/reference/transpose.cpp
index e5b23869e..4b9508899 100644
--- a/src/nnfusion/core/kernels/cpu/reference/transpose.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/transpose.cpp
@@ -114,7 +114,7 @@ namespace nnfusion
 
             REGISTER_KERNEL_EMITTER(
                 "Transpose",                                                   // op_name
-                Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("reference"), // attrs
+                Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 TransposeRef)                                                  // constructor
 
         } // namespace cpu
diff --git a/src/nnfusion/core/kernels/cpu/reference/variable.cpp b/src/nnfusion/core/kernels/cpu/reference/variable.cpp
index 8ea594d1c..2b8bf9c0c 100644
--- a/src/nnfusion/core/kernels/cpu/reference/variable.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/variable.cpp
@@ -68,5 +68,5 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Variable",                                   //op_name
-                        Device(GENERIC_CPU).TypeConstraint(DT_FLOAT), //attrs
+                        Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
                         cpu::Variable)                                // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp b/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp
index 02dadaeef..cd725f542 100644
--- a/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp
+++ b/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp
@@ -8,7 +8,7 @@ using namespace nnfusion::kernels;
 
 #define REGISTER_EW_KERNEL(OP_NAME)                                                                \
     REGISTER_KERNEL_EMITTER("" #OP_NAME "",                                                        \
-                            Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("simd").Priority(5),  \
+                            Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5),  \
                             cpu::ElementwiseSimd<nnfusion::op::OP_NAME>);
 
 REGISTER_EW_KERNEL(Abs)
diff --git a/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp b/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp
index 6643e9b23..1423244bf 100644
--- a/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp
+++ b/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp
@@ -439,5 +439,5 @@ LanguageUnit_p ElementwiseFused::emit_comments()
 
 REGISTER_KERNEL_EMITTER(
     "ElementwiseFused",                                                   // op_name
-    Device(GENERIC_CPU).TypeConstraint(DT_FLOAT).Tag("simd").Priority(5), // attrs
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5), // attrs
     cpu::ElementwiseFused)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
index 7d21aead2..b0ebd6c8b 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
@@ -20,6 +20,7 @@ std::vector<int> cuda::compute_strides(const std::vector<int>& shape)
 std::string cuda::get_cudnn_datatype(std::string dtype)
 {
     static const std::unordered_map<std::string, std::string> datatype_map{
+        {"half", "CUDNN_DATA_HALF"},
         {"float", "CUDNN_DATA_FLOAT"},
         {"double", "CUDNN_DATA_DOUBLE"},
         {"int8_t", "CUDNN_DATA_INT8"},
@@ -30,11 +31,11 @@ std::string cuda::get_cudnn_datatype(std::string dtype)
     return p->second;
 }
 
-LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc)
+LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, string type)
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
-    string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type);
+    string data_type = cuda::get_cudnn_datatype(type);
     string tensor_format = "CUDNN_TENSOR_NCHW";
     lu << "cudnnTensorDescriptor_t " << desc << ";\n";
     lu << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << desc << "));\n";
@@ -91,12 +92,12 @@ LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& s
     return _lu;
 }
 
-LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc)
+LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, string type)
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
 
-    string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type);
+    string data_type = cuda::get_cudnn_datatype(type);
     string tensor_format = "CUDNN_TENSOR_NCHW";
     lu << "cudnnFilterDescriptor_t " << desc << ";\n";
     lu << "CUDNN_SAFE_CALL(cudnnCreateFilterDescriptor(&" << desc << "));\n";
@@ -143,12 +144,13 @@ LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc
 LanguageUnit_p cuda::get_cudnn_convolution_descriptor(const Shape& padding,
                                                       const Strides& window_movement_strides,
                                                       const Strides& window_dilation_strides,
-                                                      string desc)
+                                                      string desc,
+                                                      string type)
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
 
-    string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type);
+    string data_type = cuda::get_cudnn_datatype(type);
     string tensor_format = "CUDNN_TENSOR_NCHW";
     lu << "cudnnConvolutionDescriptor_t " << desc << ";\n";
     lu << "CUDNN_SAFE_CALL(cudnnCreateConvolutionDescriptor(&" << desc << "));\n";
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
index b127e5aa5..63d63ce66 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
@@ -15,12 +15,14 @@ namespace nnfusion
             std::vector<int> compute_strides(const std::vector<int>& shape);
             std::string get_cudnn_datatype(std::string dtype);
             LanguageUnit_p cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape,
-                                                              string desc);
+                                                              string desc,
+                                                              string type = "float");
             LanguageUnit_p get_cudnn_convolution_descriptor(const Shape& padding,
                                                             const Strides& window_movement_strides,
                                                             const Strides& window_dilation_strides,
-                                                            string desc);
-            LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc);
+                                                            string desc,
+                                                            string type = "float");
+            LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc, string type = "float");
             LanguageUnit_p get_dropout_global_states(float ratio);
             inline std::string ratio2str(float ratio)
             {
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
index 6d21791ab..62f1bf6cc 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
@@ -14,6 +14,7 @@ LU_DEFINE(header::cudnn, "#include <cudnn.h>\n");
 LU_DEFINE(header::super_scaler, "#include \"super_scaler.h\"\n");
 LU_DEFINE(header::cupti, "#include <cupti.h>\n");
 LU_DEFINE(header::cuda_prof_api, "#include <cuda_profiler_api.h>\n");
+LU_DEFINE(header::cuda_fp16, "#include <cuda_fp16.h>\n"); // newline-terminated like the sibling header units
 
 // Macro
 LU_DEFINE(
@@ -223,6 +224,15 @@ __device__ __forceinline__ float  load(const float*  __restrict__ in, int i=0, b
     }
     return v;
 }
+__device__ __forceinline__ half  load(const half*  __restrict__ in, int i=0, bool b=true)
+{
+    half v = 0.0f;
+    if (b)
+    {
+        v = __ldg(in + i);
+    }
+    return v;
+}
 __device__ __forceinline__ int32_t  load(const int32_t*  __restrict__ in, int i=0, bool b=true)
 {
     int32_t v = 0;
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
index 48ab0eb3c..ecaeb6c89 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
@@ -16,6 +16,7 @@ namespace nnfusion
             LU_DECLARE(super_scaler);
             LU_DECLARE(cupti);
             LU_DECLARE(cuda_prof_api);
+            LU_DECLARE(cuda_fp16);
         } // namespace header
 
         namespace macro
diff --git a/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp b/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp
index c97f329fa..9ef0d123c 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp
@@ -65,5 +65,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(__KernelOpType__,                                             // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs
                         cuda::__KernelUniqueClassName__)                              // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/adam_optimizer.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/adam_optimizer.cpp
index 7cb3efed8..8008f9d62 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/adam_optimizer.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/adam_optimizer.cpp
@@ -113,5 +113,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("AdamOptimizer",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"),
                         cuda::AdamOptimizer)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp
index 90a5fa94f..b19c6d2ae 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp
@@ -90,5 +90,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("AddN",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::AddN)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp
index 9e5ccc8f5..525968ab2 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp
@@ -50,5 +50,5 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("AllReduce",                                           //op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
                         cuda::SuperScalerAllReduce)                            // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp
index 29de6ccb7..1adc0952c 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp
@@ -35,5 +35,5 @@ LanguageUnit_p cuda::AnyOP::emit_dependency()
 // Register Pad kernel emitter
 
 REGISTER_KERNEL_EMITTER("AnyOP",                                               //op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
                         cuda::AnyOP)                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
index 0ad0fee4e..566e1b535 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
@@ -106,5 +106,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("ApplyAdam",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::ApplyAdam)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp
index 6dbb94934..05dcc1087 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp
@@ -76,8 +76,8 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("ApplyGradient",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::ApplyGradientDescent)
 REGISTER_KERNEL_EMITTER("ApplyGradientDescent",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::ApplyGradientDescent)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp
index 7d3e76f74..72716bd4e 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp
@@ -80,5 +80,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("ApplyMomentum",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::ApplyMomentum)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp
index 867ef132b..f22ba886b 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp
@@ -76,5 +76,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("Assign",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::Assign)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp
index 00bfb66ec..d21de903d 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp
@@ -76,5 +76,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("AssignSub",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::AssignSub)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp
index 81be99d8b..bc2c56b1b 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp
@@ -405,10 +405,10 @@ LanguageUnit_p cuda::AvgPoolmD::emit_function_signature()
 
 REGISTER_KERNEL_EMITTER(
     "AvgPool",                                                                // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::AvgPool1D)                                                          // constructor
 
 REGISTER_KERNEL_EMITTER(
     "AvgPool",                                                                 // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
     cuda::AvgPoolmD)                                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
index bcec70235..ad81ec3d8 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
@@ -193,10 +193,10 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "BatchMatMul",                                                            // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::BatchMatMul)                                                        // constructor
 
 REGISTER_KERNEL_EMITTER(
     "BatchMatMul",                                                            // op_name
-    Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::BatchMatMul)                                                        // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp
index 352deab10..6a64bee78 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp
@@ -202,11 +202,11 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn").Priority(2), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn").Priority(2), // attrs
                         cuda::BatchNorm)      // constructor
 REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda").Priority(2), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs
                         cuda::BatchNormNCHW)  // constructor
 REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name
-                        Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda").Priority(2), // attrs
+                        Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs
                         cuda::BatchNormNCHW) // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp
index 2df9329d8..22cea1999 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp
@@ -72,5 +72,5 @@ void BlockFusionFused::set_launch_config()
 
 REGISTER_KERNEL_EMITTER(
     "BlockFusionFused",                                                       // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::BlockFusionFused)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp
index 060004fb1..08660ed41 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp
@@ -195,9 +195,9 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Broadcast",                                           //op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
                         cuda::Broadcast)                                       // constructor
 
 REGISTER_KERNEL_EMITTER("Broadcast",                                           //op_name
-                        Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs
+                        Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs
                         cuda::RocmBroadcast)                                   // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp
index dc06709fd..3b39408e0 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp
@@ -325,7 +325,7 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Concat",                                  //op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT), //attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32), //attrs
                         cuda::Concat)                              // constructor
 
 namespace nnfusion
@@ -480,5 +480,5 @@ namespace nnfusion
 } // namespace nnfusion
 
 REGISTER_KERNEL_EMITTER("Concat",                                  //op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT), //attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32), //attrs
                         cuda::ConcatKernel)                        // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp
index 0a4af44c9..586091381 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp
@@ -74,5 +74,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "ConcatOffset",                                                           // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::ConcatOffset)                                                       // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
index e1f1d5d14..d6e0d9000 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
@@ -120,5 +120,5 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Constant",                                            //op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
                         cuda::Constant)                                        // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
index a8053d74d..7ddcb0426 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
@@ -12,6 +12,12 @@ cuda::ConvolutionCudnn::ConvolutionCudnn(shared_ptr<KernelContext> ctx)
 {
     auto conv = static_pointer_cast<nnfusion::op::Convolution>(ctx->gnode->get_op_ptr());
 
+    input_type = ctx->inputs[0]->get_element_type();
+    filter_type = ctx->inputs[1]->get_element_type();
+    output_type = ctx->outputs[0]->get_element_type();
+    NNFUSION_CHECK(input_type == filter_type && input_type == output_type) 
+        << "Convolution input datatype (" << input_type << ") should be the same with that of filter (" << filter_type << "), and that of output (" << output_type << ").";
+    conv_type = input_type;
     input_shape = ctx->inputs[0]->get_shape();
     filter_shape = ctx->inputs[1]->get_shape();
     output_shape = ctx->outputs[0]->get_shape();
@@ -79,13 +85,14 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_body()
         padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
     }
 
+
     {
         // lu << "cudnnDataType_t data_type = " << get_cudnn_datatype(dtype) << ";\n";
-        lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0")->get_code();
-        lu << cudnn_tensor_descriptor_from_shape(output_shape, "tensor_desc_1")->get_code();
-        lu << get_cudnn_filter_descriptor(filter_shape, "filter_desc")->get_code();
+        lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type.c_type_string())->get_code(); // helpers take the C type name ("float", "half", ...)
+        lu << cudnn_tensor_descriptor_from_shape(output_shape, "tensor_desc_1", output_type.c_type_string())->get_code();
+        lu << get_cudnn_filter_descriptor(filter_shape, "filter_desc", filter_type.c_type_string())->get_code();
         lu << get_cudnn_convolution_descriptor(
-                  padding_below, window_movement_strides, window_dilation_strides, "conv_desc")
+                  padding_below, window_movement_strides, window_dilation_strides, "conv_desc", conv_type.c_type_string())
                   ->get_code();
 
         lu << R"(
@@ -207,5 +214,10 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_signature()
 
 REGISTER_KERNEL_EMITTER(
     "Convolution",                                                             // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
+    cuda::ConvolutionCudnn)                                                    // constructor
+
+REGISTER_KERNEL_EMITTER(
+    "Convolution",                                                             // op_name
+    Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cudnn_kernel").Priority(2), // attrs
     cuda::ConvolutionCudnn)                                                    // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.hpp
index 654cae2ca..444f2743a 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.hpp
@@ -22,6 +22,7 @@ namespace nnfusion
                 bool require_cudnn_handle() override { return true; }
             private:
                 nnfusion::Shape input_shape, filter_shape, output_shape;
+                element::Type input_type, filter_type, output_type, conv_type;
                 nnfusion::Strides window_dilation_strides, window_movement_strides,
                     data_dilation_strides;
                 nnfusion::CoordinateDiff padding_below_diff, padding_above_diff;
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/crossentropy_fwdbwd_softmax_bwd_large.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/crossentropy_fwdbwd_softmax_bwd_large.cpp
index 8eadc9d1a..b8bf9699f 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/crossentropy_fwdbwd_softmax_bwd_large.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/crossentropy_fwdbwd_softmax_bwd_large.cpp
@@ -78,5 +78,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("CrossEntropyFwdBwdWithSoftmaxBwdLarge",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"),
                         cuda::CrossEntropyFwdBwdWithSoftmaxBwdLarge)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp
index 636c8ba2b..2b1dce365 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp
@@ -373,5 +373,5 @@ LanguageUnit_p cuda::DepthwiseConv2dNative::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "DepthwiseConv2dNative",                                                  // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::DepthwiseConv2dNative)                                              // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 973204f87..7b7a3d606 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -30,6 +30,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
     auto gemm = static_pointer_cast<nnfusion::op::Dot>(ctx->gnode->get_op_ptr());
     auto trans_A = gemm->get_transpose_A();
     auto trans_B = gemm->get_transpose_B();
+    auto dtype = ctx->outputs[0]->get_element_type();
 
     LanguageUnit_p _lu(new LanguageUnit(get_function_name()));
     auto& lu = *_lu;
@@ -38,6 +39,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
     // void kernel(m_context->dtypes[0]* input0, m_context->dtypes[0]* input1, m_context->dtypes[2]* output0)
 
     //lu.block_begin();
+    if (dtype == element::f32)
     {
         // case 1: Scalar * Tensor
         if (arg0_shape.empty() || arg1_shape.empty())
@@ -201,6 +203,86 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                << " static_cast<float*>(output0),"
                << " " << n << "));\n";
         }
+    } else if (dtype == element::f16) {
+        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
+        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
+        size_t axes_for_k_count = reduction_axes;
+        size_t m = 1;
+        size_t n = 1;
+        size_t k = 1;
+
+        // check if input and output size correct
+        // check and calculate k for arg0 and arg1
+        size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
+        size_t arg1_k_idx = 0;                // first axe in arg1 for k
+
+        for (size_t i = 0; i < axes_for_k_count; i++)
+        {
+            k *= arg0_shape[arg0_k_idx];
+            if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+            {
+                std::vector<std::string> arg_vec{"arg0", "arg1"};
+                std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                        << nnfusion::join(shape_vec) << " respectively, at Node "
+                                        << m_context->gnode->get_name()
+                                        << ", do not match for dot op";
+            }
+        }
+        // check and calculate m for arg0 and out
+        size_t arg0_m_idx = 0; // first axe in arg0 for m
+        size_t out_m_idx = 0;  // first axe in out for m
+        for (size_t i = 0; i < axes_for_m_count; i++)
+        {
+            m *= arg0_shape[arg0_m_idx];
+            if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+            {
+                std::vector<std::string> arg_vec{"arg0", "output"};
+                std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
+
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                        << nnfusion::join(shape_vec) << " respectively, at Node "
+                                        << m_context->gnode->get_name()
+                                        << ", do not match for dot op";
+            }
+        }
+        // check and calculate n for arg1 and out
+        size_t arg1_n_idx = axes_for_k_count; // first axis in arg1 for n
+        size_t out_n_idx = axes_for_m_count;  // first axis in out for n
+        for (size_t i = 0; i < axes_for_n_count; i++)
+        {
+            n *= arg1_shape[arg1_n_idx];
+            if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+            {
+                std::vector<std::string> arg_vec{"arg1", "output"};
+                std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
+
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                        << nnfusion::join(shape_vec) << " respectively, at Node "
+                                        << m_context->gnode->get_name()
+                                        << ", do not match for dot op";
+            }
+        }
+
+        lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
+
+        lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+            << " CUBLAS_OP_N,"
+            << " CUBLAS_OP_N,"
+            << " " << n << ","
+            << " " << m << ","
+            << " " << k << ","
+            << " &alpha,"
+            << " static_cast<const half*>(input1),"
+            << " " << n << ","
+            << " static_cast<const half*>(input0),"
+            << " " << k << ","
+            << " &beta,"
+            << " static_cast<half*>(output0),"
+            << " " << n << "));\n";
+    } else {
+        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for kernel dot."; // only f32 and f16 are handled above
     }
     //lu.block_end();
     return _lu;
@@ -256,10 +338,15 @@ LanguageUnit_p cuda::Dot::emit_function_signature()
 
 REGISTER_KERNEL_EMITTER(
     "Dot",                                                               // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cublas").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
+    cuda::Dot)                                                           // constructor
+
+REGISTER_KERNEL_EMITTER(
+    "Dot",                                                               // op_name
+    Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs
     cuda::Dot)                                                           // constructor
 
 REGISTER_KERNEL_EMITTER(
     "Dot",                                                               // op_name
-    Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cublas").Priority(2), // attrs
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
     cuda::Dot)                                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp
index b827eef70..25b4c51ce 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp
@@ -254,9 +254,9 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("DropoutTraining",                                      // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn"), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn"), // attrs
                         cuda::DropoutTraining)                                  // constructor
 
 REGISTER_KERNEL_EMITTER("DropoutTrainingGrad",                                  // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn"), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn"), // attrs
                         cuda::DropoutTrainingGrad)                              // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
index b4caa5e3c..96385a2f2 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
@@ -123,5 +123,5 @@ LanguageUnit_p cuda::DynamicStitch::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "DynamicStitch",                                                          // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::DynamicStitch)                                                      // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp
index 949bc78a2..9597d122e 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp
@@ -9,7 +9,7 @@ using namespace nnfusion::kernels;
 #define REGISTER_EW_KERNEL(OP_NAME)                                                                \
     REGISTER_KERNEL_EMITTER(                                                                       \
         "" #OP_NAME "",                                                                            \
-        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("element_wise").Priority(2),                 \
+        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("element_wise").Priority(2),                 \
         cuda::ElementWise<nnfusion::op::OP_NAME>);
 
 REGISTER_EW_KERNEL(Abs)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp
index d8a104fa4..c4c3b0bdc 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp
@@ -341,5 +341,5 @@ void ElementWiseFused::compute_best_config(int& grids, int& blocks, int& bound)
 
 REGISTER_KERNEL_EMITTER(
     "ElementWiseFused",                                                       // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::ElementWiseFused)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp
index 0bbaa8366..94fa0e506 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp
@@ -116,7 +116,7 @@ LanguageUnit_p cuda::Gather1D::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "GatherV2",                                                               // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::Gather1D)                                                           // constructor
 
 cuda::Gather1DGrad::Gather1DGrad(shared_ptr<KernelContext> ctx)
@@ -229,5 +229,5 @@ LanguageUnit_p cuda::Gather1DGrad::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "GatherGrad",                                                             // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::Gather1DGrad)                                                       // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp
index 70be110e2..6743691cf 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp
@@ -279,9 +279,9 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("GatherND",                                                   // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs
                         cuda::GatherND)                                               // constructor
 
 REGISTER_KERNEL_EMITTER("GatherNDGrad",                                               // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs
                         cuda::GatherNDGrad)                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp
index 9e119a500..6dd9bba2e 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp
@@ -65,6 +65,6 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("InvertPermutation",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Priority(
                             2), // TODO: this op input and output will all be int
                         cuda::InvertPermutation)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp
index 7a44d730a..bc9a6f463 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp
@@ -78,5 +78,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("LayerNorm",                                              // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudalib"), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudalib"), // attrs
                         cuda::LayerNorm)                                          // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
index 49be708e2..ed603d218 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
@@ -241,10 +241,10 @@ LanguageUnit_p cuda::MaxPoolmD::emit_function_signature()
 
 REGISTER_KERNEL_EMITTER(
     "MaxPool",                                                                // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::MaxPool1D)                                                          // constructor
 
 REGISTER_KERNEL_EMITTER(
     "MaxPool",                                                                 // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
     cuda::MaxPoolmD)                                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp
index e0fea7288..1368e1244 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp
@@ -110,5 +110,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "OneHot",                                                                 // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::OneHot)                                                             // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
index 68434d486..0ac7149dc 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
@@ -138,7 +138,7 @@ KernelRegistrar kernel_registrar0(
     "Pad",
     Name("Pad")
         .Device(CUDA_GPU)
-        .TypeConstraint(DT_FLOAT)
+        .TypeConstraint(element::f32)
         .Tag("cuda_kernel")
         .KernelFactory([](shared_ptr<KernelContext> context) -> shared_ptr<KernelEmitter> {
             return make_shared<cuda::Pad>(context);
@@ -148,5 +148,5 @@ KernelRegistrar kernel_registrar0(
 
 REGISTER_KERNEL_EMITTER(
     "Pad",                                                                    // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::Pad)                                                                // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
index 4eb1d090c..397eeb9b4 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
@@ -64,5 +64,5 @@ LanguageUnit_p cuda::Range::emit_dependency()
 }
 REGISTER_KERNEL_EMITTER(
     "Range",                                                                  // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::Range)                                                              // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp
index ae0b6097c..15dd2d3ce 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp
@@ -7,55 +7,55 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("Max",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::Reduce<nnfusion::op::Max>)
 
 REGISTER_KERNEL_EMITTER(
     "Max",                                                                 // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Max>)
 
 REGISTER_KERNEL_EMITTER("Min",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::Reduce<nnfusion::op::Min>)
 
 REGISTER_KERNEL_EMITTER(
     "Min",                                                                 // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Min>)
 
 REGISTER_KERNEL_EMITTER("Product",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::Reduce<nnfusion::op::Multiply>)
 
 REGISTER_KERNEL_EMITTER(
     "Product",                                                             // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Multiply>)
 
 REGISTER_KERNEL_EMITTER("Sum",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::Reduce<nnfusion::op::Add>)
 
 REGISTER_KERNEL_EMITTER(
     "Sum",                                                                 // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Add>)
 
 REGISTER_KERNEL_EMITTER("Sum",
-                        Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::Reduce<nnfusion::op::Add>)
 
 REGISTER_KERNEL_EMITTER(
     "Sum",                                                                 // op_name
-    Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Add>)
 
 REGISTER_KERNEL_EMITTER("ReduceAny",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::Reduce<nnfusion::op::Or>)
 
 REGISTER_KERNEL_EMITTER(
     "ReduceAny",                                                           // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Or>)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp
index 2c94e1335..6760325ec 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp
@@ -110,5 +110,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "All",                                                                    // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::All)                                                                // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp
index caf0b0f17..168a91011 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp
@@ -558,20 +558,20 @@ LanguageUnit_p cuda::ReshapeMemcpy::emit_function_signature()
 
 REGISTER_KERNEL_EMITTER(
     "Reshape",                                                                   // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel_2D").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_2D").Priority(2), // attrs
     cuda::Reshape2D)                                                             // constructor
 
 REGISTER_KERNEL_EMITTER(
     "Reshape",                                                                   // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel_3D").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_3D").Priority(2), // attrs
     cuda::Reshape3D)                                                             // constructor
 
 REGISTER_KERNEL_EMITTER(
     "Reshape",                                                                  // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel_D").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_D").Priority(2), // attrs
     cuda::ReshapehD)                                                            // constructor
 
 REGISTER_KERNEL_EMITTER(
     "Reshape",                                                             // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReshapeMemcpy)                                                   // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
index 61d57f210..46e81ade7 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
@@ -85,5 +85,5 @@ LanguageUnit_p cuda::Result::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "Result",                                                              // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_lib").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::Result)                                                          // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
index 56df9f1de..36f2d39b5 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
@@ -102,5 +102,5 @@ LanguageUnit_p cuda::Reverse::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "Reverse",                                                                // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::Reverse)                                                            // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
index dc14f153b..6f0e0e2bc 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
@@ -125,9 +125,9 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER(
     "ReverseSequence",                                                        // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::ReverseSequence)                                                    // constructor
 
 REGISTER_KERNEL_EMITTER("ReverseSequence",                                     // op_name
-                        Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(2), // attrs
+                        Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), // attrs
                         cuda::RocmReverseSequence)                             // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp
index 4718b9127..5ed2d8ea4 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp
@@ -113,5 +113,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("BatchMatMul",                                         // op_name
-                        Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(4), // attrs
+                        Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), // attrs
                         cuda::BatchGemmFixed)                                  // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp
index 9cc765ba2..63dd091e4 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp
@@ -173,5 +173,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("Broadcast",                                           //op_name
-                        Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(3), //attrs
+                        Device(ROCM_GPU).TypeConstraint(element::f32).Priority(3), //attrs
                         cuda::RocmBiasBroadcast)                               // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp
index b88ce8ab3..f5f8b3a50 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp
@@ -296,5 +296,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("Broadcast",                                           // op_name
-                        Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(4), // attrs
+                        Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), // attrs
                         cuda::RocmManualBroadcast)                             // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp
index c9b54b46f..04fb041b8 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp
@@ -142,5 +142,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Convolution",                                                            // op_name
-    Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::ConvFwdFixed)                                                       // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp
index c8920754e..2b19b8db0 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp
@@ -227,5 +227,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Convolution",                                                             // op_name
-    Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
     cuda::RocmConvolutionCudnn)                                                // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp
index 15cfecdc5..983b7fce3 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp
@@ -208,5 +208,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Dot",                                                                    // op_name
-    Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::GemmFixed)                                                          // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp
index af26b29a9..a476ddd06 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp
@@ -335,7 +335,7 @@ using namespace nnfusion::kernels;
 
 #define REGISTER_GPU_KERNEL(KEY, OP_NAME)                                                          \
     REGISTER_KERNEL_EMITTER(KEY,                                                                   \
-                            Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(4),                 \
+                            Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4),                 \
                             cuda::RocmReduce<nnfusion::op::OP_NAME>)
 
 REGISTER_GPU_KERNEL("Sum", Add)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp
index b68c94011..a2a54782d 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp
@@ -135,5 +135,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Softmax",                                                                // op_name
-    Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::RocmSoftmax)                                                        // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp
index 29a0d2090..fb2fd5930 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp
@@ -98,7 +98,7 @@ using namespace nnfusion::kernels;
 
 #define REGISTER_SCATTER_KERNEL(OP_NAME, KERNEL_NAME)                                              \
     REGISTER_KERNEL_EMITTER("" #KERNEL_NAME "",                                                    \
-                            Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("scatter").Priority(2),  \
+                            Device(CUDA_GPU).TypeConstraint(element::f32).Tag("scatter").Priority(2),  \
                             cuda::Scatter<nnfusion::op::OP_NAME>);
 
 REGISTER_SCATTER_KERNEL(Subtract, ScatterSub);
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/select_node.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/select_node.cpp
index 33cc8bed3..c55ac7a3f 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/select_node.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/select_node.cpp
@@ -72,5 +72,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("SelectNode",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"),
                         cuda::SelectNode)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp
index 9bb7e9507..d5bb5910d 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp
@@ -139,5 +139,5 @@ LanguageUnit_p cuda::Slice::emit_dependency()
 
 REGISTER_KERNEL_EMITTER(
     "Slice",                                                                  // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::Slice)                                                              // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
index 16287054b..0dba0aec4 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
@@ -133,7 +133,7 @@ LanguageUnit_p cuda::Softmax::emit_function_signature()
 
 REGISTER_KERNEL_EMITTER(
     "Softmax",                                                                 // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
     cuda::Softmax)                                                             // constructor
 
 cuda::SoftmaxGrad::SoftmaxGrad(shared_ptr<KernelContext> ctx)
@@ -267,5 +267,5 @@ LanguageUnit_p cuda::SoftmaxGrad::emit_function_signature()
 
 REGISTER_KERNEL_EMITTER(
     "SoftmaxGrad",                                                             // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cudnn_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
     cuda::SoftmaxGrad)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp
index 836d23f0b..063716257 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp
@@ -127,5 +127,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("SparseApplyMomentum",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::SparseApplyMomentum)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp
index c89bdfadb..6f7192da6 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp
@@ -116,5 +116,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "StopGradient",                                                           // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::StopGradient)                                                       // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
index d305cd12e..d796ef822 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
@@ -114,5 +114,5 @@ LanguageUnit_p cuda::StridedSliceGrad::emit_dependency()
 }
 REGISTER_KERNEL_EMITTER(
     "StridedSliceGrad",                                                       // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::StridedSliceGrad)                                                   // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
index 75d4ad303..4b22d7c25 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
@@ -147,9 +147,9 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("Tile",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::Tile)
 
 REGISTER_KERNEL_EMITTER("Tile",                                                //op_name
-                        Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs
+                        Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs
                         cuda::RocmTile)                                        // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp
index 19ae56c60..7016e0518 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp
@@ -144,5 +144,5 @@ using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
     "Transpose",                                                              // op_name
-    Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2), // attrs
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::Transpose)                                                          // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp
index 782a4bf44..e816cd7b9 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp
@@ -186,5 +186,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("UnsortedSegmentSum",
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel").Priority(2),
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
                         cuda::UnsortedSegmentSum)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
index f829a7e6d..368e24241 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
@@ -78,5 +78,5 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Variable",                                            //op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Priority(2), //attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
                         cuda::Variable)                                        // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp
index 547994202..0a457b435 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp
@@ -59,5 +59,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("Zeros",                                                      // op_name
-                        Device(CUDA_GPU).TypeConstraint(DT_FLOAT).Tag("cuda_kernel"), // attrs
+                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs
                         cuda::Zeros)                                                  // constructor
diff --git a/src/nnfusion/core/kernels/hlsl/constant.cpp b/src/nnfusion/core/kernels/hlsl/constant.cpp
index 43fd2ae60..61a701bf7 100644
--- a/src/nnfusion/core/kernels/hlsl/constant.cpp
+++ b/src/nnfusion/core/kernels/hlsl/constant.cpp
@@ -77,5 +77,5 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Constant",
-                        Device(HLSL).TypeConstraint(DT_FLOAT).Tag("hlsl_kernel"),
+                        Device(HLSL).TypeConstraint(element::f32).Tag("hlsl_kernel"),
                         hlsl::Constant)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/hlsl/parameter.cpp b/src/nnfusion/core/kernels/hlsl/parameter.cpp
index 1d261fe32..2ed1d7740 100644
--- a/src/nnfusion/core/kernels/hlsl/parameter.cpp
+++ b/src/nnfusion/core/kernels/hlsl/parameter.cpp
@@ -56,5 +56,5 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Parameter",
-                        Device(HLSL).TypeConstraint(DT_FLOAT).Tag("hlsl_kernel"),
+                        Device(HLSL).TypeConstraint(element::f32).Tag("hlsl_kernel"),
                         hlsl::Parameter)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/hlsl/result.cpp b/src/nnfusion/core/kernels/hlsl/result.cpp
index 6441da6eb..39f584e73 100644
--- a/src/nnfusion/core/kernels/hlsl/result.cpp
+++ b/src/nnfusion/core/kernels/hlsl/result.cpp
@@ -48,5 +48,5 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER("Result",
-                        Device(HLSL).TypeConstraint(DT_FLOAT).Tag("hlsl_kernel"),
+                        Device(HLSL).TypeConstraint(element::f32).Tag("hlsl_kernel"),
                         hlsl::Result)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp
index ab6ce4d0b..f18d75b7d 100644
--- a/src/nnfusion/core/kernels/kernel_registration.cpp
+++ b/src/nnfusion/core/kernels/kernel_registration.cpp
@@ -3,6 +3,7 @@
 
 #include "kernel_registration.hpp"
 #include "nnfusion/util/util.hpp"
+#include "ngraph/src/nnfusion/common/type/element_type.hpp"
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
@@ -18,7 +19,7 @@ KernelRegistration& KernelRegistration::Device(const NNFusion_DeviceType device_
     return *this;
 }
 
-KernelRegistration& KernelRegistration::TypeConstraint(const DataType data_type)
+KernelRegistration& KernelRegistration::TypeConstraint(const element::Type data_type)
 {
     m_data_type = data_type;
     return *this;
@@ -72,7 +73,7 @@ shared_ptr<const KernelRegistration>
 }
 
 shared_ptr<const KernelRegistration> KernelRegistry::FindKernelRegistration(
-    const string op_name, const NNFusion_DeviceType& device_type, const DataType data_type)
+    const string op_name, const NNFusion_DeviceType& device_type, const element::Type data_type)
 {
     std::vector<shared_ptr<const KernelRegistration>> matched_regs;
     auto regs = m_kernel_registry.equal_range(op_name);
@@ -97,7 +98,7 @@ shared_ptr<const KernelRegistration> KernelRegistry::FindKernelRegistration(
 }
 
 std::vector<shared_ptr<const KernelRegistration>> KernelRegistry::FindKernelRegistrations(
-    const string op_name, const NNFusion_DeviceType& device_type, const DataType data_type)
+    const string op_name, const NNFusion_DeviceType& device_type, const element::Type data_type)
 {
     std::vector<shared_ptr<const KernelRegistration>> matched_regs;
     auto regs = m_kernel_registry.equal_range(op_name);
diff --git a/src/nnfusion/core/kernels/kernel_registration.hpp b/src/nnfusion/core/kernels/kernel_registration.hpp
index 07c70d9c5..db19454aa 100644
--- a/src/nnfusion/core/kernels/kernel_registration.hpp
+++ b/src/nnfusion/core/kernels/kernel_registration.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "kernel_emitter.hpp"
+#include "ngraph/src/nnfusion/common/type/element_type.hpp"
 
 namespace nnfusion
 {
@@ -24,7 +25,7 @@ namespace nnfusion
 
             // Specify the data (inputs/outputs) types this kernel supports
             // Return *this
-            KernelRegistration& TypeConstraint(const DataType data_type);
+            KernelRegistration& TypeConstraint(const element::Type data_type);
 
             // Add an arbitrary user-defined tag on the kernel to allow the operator
             // to choose this kernel
@@ -57,7 +58,7 @@ namespace nnfusion
             friend class KernelRegistry;
             string m_op_name;
             NNFusion_DeviceType m_device_type;
-            DataType m_data_type;
+            element::Type m_data_type;
             string m_tag;
             Factory m_factory;
             size_t m_priority = 0;
@@ -79,11 +80,11 @@ namespace nnfusion
             shared_ptr<const KernelRegistration>
                 FindKernelRegistration(const string op_name,
                                        const NNFusion_DeviceType& device_type,
-                                       const DataType data_type);
+                                       const element::Type data_type);
             std::vector<shared_ptr<const KernelRegistration>>
                 FindKernelRegistrations(const string op_name,
                                         const NNFusion_DeviceType& device_type,
-                                        const DataType data_type);
+                                        const element::Type data_type);
             shared_ptr<const KernelRegistration>
                 KernelSelect(std::vector<shared_ptr<const KernelRegistration>>& matched_regs);
 
diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index 65ec30522..581ed3e03 100755
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -838,6 +838,7 @@ void CudaCodegenPass::create_main_file(std::shared_ptr<InterpreterContext> ctx,
     re_main->require(header::limits);
 
     re_main->require(header::cuda_prof_api);
+    re_main->require(header::cuda_fp16);
     re_main->require(macro::CUDA_SAFE_CALL);
 
     lu_main << "#include \"nnfusion_rt.h\"\n";
diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp b/src/nnfusion/engine/pass/graph/kernel_selection.cpp
index be465f93b..fcd3148c7 100644
--- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp
+++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp
@@ -436,7 +436,7 @@ bool DefaultKernelSelector::register_antares_kernel()
             op_name,
             Name(op_name)
                 .Device(CUDA_GPU)
-                .TypeConstraint(DT_FLOAT)
+                .TypeConstraint(element::f32)
                 .Tag("antares")
                 .Priority(9)
                 .KernelFactory([](shared_ptr<KernelContext> context) -> shared_ptr<KernelEmitter> {
@@ -447,7 +447,7 @@ bool DefaultKernelSelector::register_antares_kernel()
             op_name,
             Name(op_name)
                 .Device(GENERIC_CPU)
-                .TypeConstraint(DT_FLOAT)
+                .TypeConstraint(element::f32)
                 .Tag("antares")
                 .Priority(9)
                 .KernelFactory([](shared_ptr<KernelContext> context) -> shared_ptr<KernelEmitter> {
@@ -458,7 +458,7 @@ bool DefaultKernelSelector::register_antares_kernel()
             op_name,
             Name(op_name)
                 .Device(HLSL)
-                .TypeConstraint(DT_FLOAT)
+                .TypeConstraint(element::f32)
                 .Tag("antares")
                 .Priority(9)
                 .KernelFactory([](shared_ptr<KernelContext> context) -> shared_ptr<KernelEmitter> {
diff --git a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
index 9dba39c21..1d8f5e926 100644
--- a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
+++ b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
@@ -284,56 +284,6 @@ namespace nnfusion
                 return true;
             }
 
-            /*
-            const std::map<tensorflow::DataType,
-                           std::pair<std::function<bool(const tensorflow::NodeDef&,
-                                                        nnfusion::element::Type,
-                                                        std::shared_ptr<nnfusion::op::Op>*)>,
-                                     const nnfusion::element::Type>>&
-                TF_NGRAPH_CONST_MAP()
-            {
-                static const std::map<
-                    tensorflow::DataType,
-                    std::pair<std::function<bool(const tensorflow::NodeDef&,
-                                                 nnfusion::element::Type,
-                                                 std::shared_ptr<nnfusion::op::Op>*)>,
-                              const nnfusion::element::Type>>
-                    the_map = {
-                        {tensorflow::DataType::DT_FLOAT,
-                         std::make_pair(MakeConstOp<float>, nnfusion::element::f32)},
-                        {tensorflow::DataType::DT_DOUBLE,
-                         std::make_pair(MakeConstOp<double>, nnfusion::element::f64)},
-                        {tensorflow::DataType::DT_INT8,
-                         std::make_pair(MakeConstOp<int8>, nnfusion::element::i8)},
-                        {tensorflow::DataType::DT_INT16,
-                         std::make_pair(MakeConstOp<int16>, nnfusion::element::i16)},
-                        // {tensorflow::DataType::DT_QINT8,
-                        //   std::make_pair(MakeConstOp<google::protobuf::qint8>, nnfusion::element::i8)},
-                        // {tensorflow::DataType::DT_QUINT16,
-                        //   std::make_pair(MakeConstOp<google::protobuf::quint8>, nnfusion::element::u8)},
-                        {tensorflow::DataType::DT_INT32,
-                         std::make_pair(MakeConstOp<int32>, nnfusion::element::i32)},
-                        {tensorflow::DataType::DT_INT64,
-                         std::make_pair(MakeConstOp<int64>, nnfusion::element::i64)},
-                        {tensorflow::DataType::DT_UINT8,
-                         std::make_pair(MakeConstOp<uint8>, nnfusion::element::u8)},
-                        {tensorflow::DataType::DT_UINT16,
-                         std::make_pair(MakeConstOp<uint16>, nnfusion::element::u16)},
-                        {tensorflow::DataType::DT_UINT32,
-                         std::make_pair(MakeConstOp<uint32>, nnfusion::element::u32)},
-                        {tensorflow::DataType::DT_UINT64,
-                         std::make_pair(MakeConstOp<uint64>, nnfusion::element::u64)},
-                        {tensorflow::DataType::DT_BOOL,
-                         std::make_pair(MakeConstOp<bool, char>, nnfusion::element::boolean)},
-                        {tensorflow::DataType::DT_STRING,
-                         std::make_pair(MakeConstOp<std::string, char>,
-                                        nnfusion::element::character)}};
-                // TODO: data type string unsupport now, bert model has string type const op used for assert
-
-                return the_map;
-            }
-            */
-
             NamedNodeVector TranslateConstOp(const tensorflow::NodeDef& node,
                                              const NodeMap& all_ng_nodes,
                                              std::shared_ptr<nnfusion::graph::Graph> m_graph)
@@ -363,20 +313,6 @@ namespace nnfusion
 
                 return ret;
             }
-
-            // const std::map<tensorflow::DataType, element::Type> TF_NGRAPH_CONST_MAP = {
-            //     {tensorflow::DataType::DT_FLOAT, nnfusion::element::f32},
-            //     {tensorflow::DataType::DT_DOUBLE, nnfusion::element::f64},
-            //     {tensorflow::DataType::DT_INT8, nnfusion::element::i8},
-            //     {tensorflow::DataType::DT_INT16, nnfusion::element::i16},
-            //     {tensorflow::DataType::DT_INT32, nnfusion::element::i32},
-            //     {tensorflow::DataType::DT_INT64, nnfusion::element::i64},
-            //     {tensorflow::DataType::DT_UINT8, nnfusion::element::u8},
-            //     {tensorflow::DataType::DT_UINT16, nnfusion::element::u16},
-            //     {tensorflow::DataType::DT_UINT32, nnfusion::element::u32},
-            //     {tensorflow::DataType::DT_UINT64, nnfusion::element::u64},
-            //     {tensorflow::DataType::DT_BOOL, nnfusion::element::boolean},
-            //     {tensorflow::DataType::DT_STRING, nnfusion::element::character}};
         } // namespace tensorflow_import
     }     // namespace frontend
 } // namespace nnfusion
diff --git a/test/main.cpp b/test/main.cpp
index d7c57d838..5c8ddaca8 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -36,7 +36,7 @@ int main(int argc, char** argv)
                 op_name,
                 nnfusion::kernels::Name(op_name)
                     .Device(CUDA_GPU)
-                    .TypeConstraint(DT_FLOAT)
+                    .TypeConstraint(element::f32)
                     .Tag("antares")
                     .Priority(9)
                     .KernelFactory([](shared_ptr<nnfusion::kernels::KernelContext> context)

From 7215c91b2f5740d462a4da928a6a61e6a7b750e7 Mon Sep 17 00:00:00 2001
From: Cjkkkk <shanbinke@gmail.com>
Date: Thu, 26 Nov 2020 13:14:27 +0800
Subject: [PATCH 04/32] log start/end of batchnorm inference folding pass

---
 src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp            | 2 +-
 .../engine/pass/graph/batchnorm_inference_folding_pass.cpp    | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 973204f87..7cd431cc5 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -262,4 +262,4 @@ REGISTER_KERNEL_EMITTER(
 REGISTER_KERNEL_EMITTER(
     "Dot",                                                               // op_name
     Device(ROCM_GPU).TypeConstraint(DT_FLOAT).Tag("cublas").Priority(2), // attrs
-    cuda::Dot)                                                           // constructor
+    cuda::Dot)                                                           // constructor
\ No newline at end of file
diff --git a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
index ab1a671fe..1dc3b5521 100644
--- a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
+++ b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
@@ -886,6 +886,8 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptr<nnfusion::graph
     bool folding_flag = FLAGS_fbatchnorm_inference_folding;
     if (folding_flag)
     {
+        NNFUSION_LOG(INFO) << "batchnorm inference folding Pass starts up for Graph: "
+                                       << graph->get_name();
         for (auto pattern : BN_FOLDING_PATTERNS)
         {
             BatchNormInferenceOptimizer optimizer(graph, pattern);
@@ -896,6 +898,8 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptr<nnfusion::graph
             auto const_folding_optimizer = RuntimeConstantFoldingPass();
             const_folding_optimizer.run_on_graph(graph);
         }
+        NNFUSION_LOG(INFO) << "batchnorm inference folding Pass ends for Graph: "
+                                       << graph->get_name();
     }
     return true;
 }
\ No newline at end of file

From 2e757837da2386dd169ac83eb5d9fda2aead1f9b Mon Sep 17 00:00:00 2001
From: Cjkkkk <shanbinke@gmail.com>
Date: Thu, 26 Nov 2020 13:41:10 +0800
Subject: [PATCH 05/32] take element::Type in cudnn datatype helpers

---
 src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp      | 10 +++++-----
 src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp      |  8 ++++----
 src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp |  2 +-
 src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp  |  4 ++--
 src/nnfusion/engine/pass/graph/kernel_selection.cpp    |  8 ++++----
 .../engine/pass/graph/runtime_const_folding_pass.cpp   |  8 ++++----
 src/nnfusion/engine/profiler/cpu_runtime.cpp           |  2 +-
 src/nnfusion/engine/profiler/profiler.cpp              |  2 +-
 src/nnfusion/frontend/util/evaluator.hpp               |  6 +++---
 test/nnfusion/engine/profiler.cpp                      |  2 +-
 test/nnfusion/kernels/sample.cpp                       |  2 +-
 test/nnfusion/test_util/common.hpp                     |  2 +-
 12 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
index b0ebd6c8b..f8891b025 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
@@ -17,7 +17,7 @@ std::vector<int> cuda::compute_strides(const std::vector<int>& shape)
     return strides;
 }
 
-std::string cuda::get_cudnn_datatype(std::string dtype)
+std::string cuda::get_cudnn_datatype(element::Type dtype)
 {
     static const std::unordered_map<std::string, std::string> datatype_map{
         {"half", "CUDNN_DATA_HALF"},
@@ -25,13 +25,13 @@ std::string cuda::get_cudnn_datatype(std::string dtype)
         {"double", "CUDNN_DATA_DOUBLE"},
         {"int8_t", "CUDNN_DATA_INT8"},
         {"int32_t", "CUDNN_DATA_INT32"}};
-    auto p = datatype_map.find(dtype);
+    auto p = datatype_map.find(dtype.c_type_string());
     NNFUSION_CHECK(p != datatype_map.end()) << dtype << " is not supported by cuDNN";
 
     return p->second;
 }
 
-LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, string type)
+LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, element::Type type)
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
@@ -92,7 +92,7 @@ LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& s
     return _lu;
 }
 
-LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, string type)
+LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type)
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
@@ -145,7 +145,7 @@ LanguageUnit_p cuda::get_cudnn_convolution_descriptor(const Shape& padding,
                                                       const Strides& window_movement_strides,
                                                       const Strides& window_dilation_strides,
                                                       string desc,
-                                                      string type)
+                                                      element::Type type)
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
index 63d63ce66..fc8cdd8dc 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
@@ -13,16 +13,16 @@ namespace nnfusion
         namespace cuda
         {
             std::vector<int> compute_strides(const std::vector<int>& shape);
-            std::string get_cudnn_datatype(std::string dtype);
+            std::string get_cudnn_datatype(element::Type type);
             LanguageUnit_p cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape,
                                                               string desc,
-                                                              string type = "float");
+                                                              element::Type type = element::f32);
             LanguageUnit_p get_cudnn_convolution_descriptor(const Shape& padding,
                                                             const Strides& window_movement_strides,
                                                             const Strides& window_dilation_strides,
                                                             string desc,
-                                                            string type = "float");
-            LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc, string type = "float");
+                                                            element::Type type = element::f32);
+            LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type = element::f32);
             LanguageUnit_p get_dropout_global_states(float ratio);
             inline std::string ratio2str(float ratio)
             {
diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index 581ed3e03..cf8e3d780 100755
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -345,7 +345,7 @@ std::vector<std::pair<string, vector<nnfusion::ir::Instruction::Pointer>>>
             else
             {
                 auto kernel_reg = KernelRegistry::Global()->FindKernelRegistration(
-                    "AnyOP", device_type(), DT_FLOAT);
+                    "AnyOP", device_type(), element::f32);
                 NNFUSION_CHECK(kernel_reg != nullptr) << "AnyOp Kernel not found, op="
                                                       << ins->getGNode()->get_op_type();
                 shared_ptr<KernelContext> ctx(new KernelContext(ins->getGNode()));
diff --git a/src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp b/src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp
index 66a649307..f5112c3b6 100644
--- a/src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp
+++ b/src/nnfusion/engine/pass/graph/kernel_fusion_pass.cpp
@@ -649,12 +649,12 @@ class KernelFuseOptimizer
                         if (n_device_type != GENERIC_CPU)
                         {
                             kernel_reg = KernelRegistry::Global()->FindKernelRegistration(
-                                "ElementWiseFused", CUDA_GPU, DT_FLOAT);
+                                "ElementWiseFused", CUDA_GPU, element::f32);
                         }
                         else
                         {
                             kernel_reg = KernelRegistry::Global()->FindKernelRegistration(
-                                "ElementwiseFused", GENERIC_CPU, DT_FLOAT);
+                                "ElementwiseFused", GENERIC_CPU, element::f32);
                         }
                         NNFUSION_CHECK_NOT_NULLPTR(kernel_reg);
                         auto ctx = std::make_shared<KernelContext>();
diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp b/src/nnfusion/engine/pass/graph/kernel_selection.cpp
index fcd3148c7..7b212ee2f 100644
--- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp
+++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp
@@ -26,7 +26,7 @@ pair<NNFusion_DeviceType, kernels::KernelEmitter::Pointer>
                                                  IProfilingRuntime::Pointer runtime)
 {
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, DT_FLOAT);
+        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32);
 
     // Skip since only one candidate or constant
     if (kernel_regs.size() == 1 || gnode->is_constant())
@@ -143,12 +143,12 @@ pair<NNFusion_DeviceType, kernels::KernelEmitter::Pointer>
 {
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, DT_FLOAT);
+        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32);
 
     if (devtype == ROCM_GPU)
     {
         for (auto it : KernelRegistry::Global()->FindKernelRegistrations(
-                 gnode->get_op_type(), CUDA_GPU, DT_FLOAT))
+                 gnode->get_op_type(), CUDA_GPU, element::f32))
             kernel_regs.push_back(it);
     }
 
@@ -355,7 +355,7 @@ pair<NNFusion_DeviceType, kernels::KernelEmitter::Pointer>
                                         NNFusion_DeviceType devtype)
 {
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, DT_FLOAT);
+        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
     std::vector<std::string> functions;
 
diff --git a/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp b/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp
index 115da8c53..1dcb108e6 100644
--- a/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp
+++ b/src/nnfusion/engine/pass/graph/runtime_const_folding_pass.cpp
@@ -98,24 +98,24 @@ int RuntimeConstantFoldingPass::runtime_const_folding_iterate_once(
             runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime();
             NNFUSION_CHECK(runtime->check_env());
             kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                it->get_op_type(), ROCM_GPU, DT_FLOAT);
+                it->get_op_type(), ROCM_GPU, element::f32);
             if (kernel_regs.size() == 0)
                 kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                    it->get_op_type(), CUDA_GPU, DT_FLOAT);
+                    it->get_op_type(), CUDA_GPU, element::f32);
         }
         else if (backend == "CUDA")
         {
             runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime();
             NNFUSION_CHECK(runtime->check_env());
             kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                it->get_op_type(), CUDA_GPU, DT_FLOAT);
+                it->get_op_type(), CUDA_GPU, element::f32);
         }
         else if (backend == "CPU")
         {
             runtime = nnfusion::profiler::ReferenceRuntime::Runtime();
             NNFUSION_CHECK(runtime->check_env());
             kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                it->get_op_type(), GENERIC_CPU, DT_FLOAT);
+                it->get_op_type(), GENERIC_CPU, element::f32);
         }
         else
         {
diff --git a/src/nnfusion/engine/profiler/cpu_runtime.cpp b/src/nnfusion/engine/profiler/cpu_runtime.cpp
index ff505f86f..a7e6ebd97 100644
--- a/src/nnfusion/engine/profiler/cpu_runtime.cpp
+++ b/src/nnfusion/engine/profiler/cpu_runtime.cpp
@@ -226,7 +226,7 @@ double ReferenceRuntime::invoke(const ProfilingContext::Pointer& ke, void** inpu
     auto& gnode = ke->kernel->m_context->gnode;
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
         KernelRegistry::Global()->FindKernelRegistrations(
-            gnode->get_op_type(), GENERIC_CPU, DT_FLOAT);
+            gnode->get_op_type(), GENERIC_CPU, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
 
     bool has_valid_kernel = false;
diff --git a/src/nnfusion/engine/profiler/profiler.cpp b/src/nnfusion/engine/profiler/profiler.cpp
index 469297f6c..174c3c108 100644
--- a/src/nnfusion/engine/profiler/profiler.cpp
+++ b/src/nnfusion/engine/profiler/profiler.cpp
@@ -82,7 +82,7 @@ void GraphEvaluate::create_profiling_contexts(shared_ptr<GNode> gnode)
         return;
     }
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), dev_type, DT_FLOAT);
+        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), dev_type, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
 
     for (auto kernel_reg : kernel_regs)
diff --git a/src/nnfusion/frontend/util/evaluator.hpp b/src/nnfusion/frontend/util/evaluator.hpp
index 4ec3f6282..1e5a56e36 100644
--- a/src/nnfusion/frontend/util/evaluator.hpp
+++ b/src/nnfusion/frontend/util/evaluator.hpp
@@ -109,17 +109,17 @@ namespace nnfusion
                 if (runtime->check_env())
                 {
                     kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                        gnode->get_op_type(), ROCM_GPU, DT_FLOAT);
+                        gnode->get_op_type(), ROCM_GPU, element::f32);
                     if (kernel_regs.size() == 0)
                         kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                            gnode->get_op_type(), CUDA_GPU, DT_FLOAT);
+                            gnode->get_op_type(), CUDA_GPU, element::f32);
                 }
                 else
                 {
                     runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime();
                     NNFUSION_CHECK(runtime->check_env());
                     kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                        gnode->get_op_type(), CUDA_GPU, DT_FLOAT);
+                        gnode->get_op_type(), CUDA_GPU, element::f32);
                 }
 
                 bool const_infer_success = false;
diff --git a/test/nnfusion/engine/profiler.cpp b/test/nnfusion/engine/profiler.cpp
index d83639ea2..cffc74d1c 100644
--- a/test/nnfusion/engine/profiler.cpp
+++ b/test/nnfusion/engine/profiler.cpp
@@ -25,7 +25,7 @@ TEST(nnfusion_engine_profiler, basic_utils)
 
     // Filter out the kernels meeting the requirement;
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, DT_FLOAT);
+        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
 
     // Gnerate Test data
diff --git a/test/nnfusion/kernels/sample.cpp b/test/nnfusion/kernels/sample.cpp
index da51ff135..367e6173c 100644
--- a/test/nnfusion/kernels/sample.cpp
+++ b/test/nnfusion/kernels/sample.cpp
@@ -24,7 +24,7 @@ TEST(nnfusion_core_kernels, sample)
 
     // Filter out the kernels meeting the requirement;
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, DT_FLOAT);
+        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
 
     EXPECT_GT(kernel_regs.size(), 0);
diff --git a/test/nnfusion/test_util/common.hpp b/test/nnfusion/test_util/common.hpp
index 50bf4dd99..a020b3183 100644
--- a/test/nnfusion/test_util/common.hpp
+++ b/test/nnfusion/test_util/common.hpp
@@ -74,7 +74,7 @@ namespace nnfusion
             }
             std::vector<shared_ptr<const KernelRegistration>> available_kernels =
                 KernelRegistry::Global()->FindKernelRegistrations(
-                    gnode->get_op_type(), dev_t, DT_FLOAT);
+                    gnode->get_op_type(), dev_t, element::f32);
             shared_ptr<KernelContext> ctx(new KernelContext(gnode));
             bool kernel_found = false;
             for (auto& kernel_reg : available_kernels)

From 0ac891fe1f42bf307da0b4c7eae396661c5b0c4b Mon Sep 17 00:00:00 2001
From: Cjkkkk <shanbinke@gmail.com>
Date: Thu, 26 Nov 2020 13:54:42 +0800
Subject: [PATCH 06/32] add fp16 header to header file

---
 src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp   | 2 +-
 src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
index 62f1bf6cc..9ced8d459 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
@@ -14,7 +14,7 @@ LU_DEFINE(header::cudnn, "#include <cudnn.h>\n");
 LU_DEFINE(header::super_scaler, "#include \"super_scaler.h\"\n");
 LU_DEFINE(header::cupti, "#include <cupti.h>\n");
 LU_DEFINE(header::cuda_prof_api, "#include <cuda_profiler_api.h>\n");
-LU_DEFINE(header::cuda_fp16, "#include <cuda_fp16.h>");
+LU_DEFINE(header::cuda_fp16, "#include <cuda_fp16.h>\n");
 
 // Macro
 LU_DEFINE(
diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index cf8e3d780..fd1c3b456 100755
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -803,6 +803,8 @@ void CudaCodegenPass::create_header_file(std::shared_ptr<InterpreterContext> ctx
     lu_header << declaration::typedef_int->get_code() << "\n";
     if (device_type() == CUDA_GPU || device_type() == ROCM_GPU)
         lu_header << header::cuda->get_code();
+    if (device_type() == CUDA_GPU || device_type() == ROCM_GPU) // explicit guard: the brace-less if above covers only the header::cuda line
+        lu_header << header::cuda_fp16->get_code(); // TODO only include this if half is used
 
     lu_header << "extern \"C\" int kernel_entry(";
     std::string params = get_kernel_entry_paras(tu);

From 1e38b5922a9d6b0e844c9332ac304d1369069236 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 27 Nov 2020 09:00:51 +0000
Subject: [PATCH 07/32] vgg11 runnable

---
 .../core/kernels/cuda_gpu/cuda_langunit.cpp   |  20 ++
 .../core/kernels/cuda_gpu/cuda_langunit.hpp   |   1 +
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 216 ++++++++++++------
 .../kernels/cuda_gpu/kernels/max_pool.cpp     |  21 +-
 .../kernels/cuda_gpu/kernels/max_pool.hpp     |   4 +-
 .../pass/graph/codegen_dxcompute_pass.hpp     |  14 +-
 .../pass/graph/codegen_graphcore_pass.hpp     |  14 +-
 test/nnfusion/kernels/batch_test.cpp          | 162 ++++++-------
 .../ngraph/src/nnfusion/common/type_info.cpp  |   1 +
 .../ngraph/src/nnfusion/common/type_info.hpp  |  28 +--
 10 files changed, 292 insertions(+), 189 deletions(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
index 9ced8d459..565c18c49 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
@@ -253,6 +253,26 @@ __device__ __forceinline__ int64_t  load(const int64_t*  __restrict__ in, int i=
 }
 )");
 
+LU_DEFINE( // half-precision in-place scale: each thread with offset < count does x[offset] *= *alpha
+  declaration::cuda_fp16_scale, // host wrapper nnfusionHalfScale launches ceil(len / 256) blocks of 256 threads
+  R"(
+__global__ void nnfusionHalfScaleKernel(half *x, half *alpha, size_t count)
+{
+    size_t offset = threadIdx.x + blockIdx.x * blockDim.x;
+    x += offset;
+    if (offset < count)
+    {
+        *x *= *alpha;
+    }
+}
+
+void nnfusionHalfScale(half *x, half *alpha, size_t len)
+{
+    nnfusionHalfScaleKernel<<<(len+255)/256, 256>>>(x, alpha, len);
+}
+  )"
+); // terminated with ';' like every other LU_DEFINE use in this file
+
 LU_DEFINE_EXTEND(declaration::cuda_reduce_primitive,
                  R"(
 #if CUDA_VERSION < 9000
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
index ecaeb6c89..93dbc0243 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
@@ -43,6 +43,7 @@ namespace nnfusion
             LU_DECLARE(num_SMs);
             LU_DECLARE(cuda_reduce_primitive);
             LU_DECLARE(cuda_layer_norm);
+            LU_DECLARE(cuda_fp16_scale);
         } // namespace declaration
     }     // namespace kernels
 } // namespace nnfusion
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 78c09b6b3..2f07207c3 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -204,83 +204,163 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                << " " << n << "));\n";
         }
     } else if (dtype == element::f16) {
-        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
-        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
-        size_t axes_for_k_count = reduction_axes;
-        size_t m = 1;
-        size_t n = 1;
-        size_t k = 1;
-
-        // check if input and output size correct
-        // check and calculate k for arg0 and arg1
-        size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
-        size_t arg1_k_idx = 0;                // first axe in arg1 for k
-
-        for (size_t i = 0; i < axes_for_k_count; i++)
-        {
-            k *= arg0_shape[arg0_k_idx];
-            if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+        // case 1: Scalar * Tensor
+        // if (arg0_shape.empty() || arg1_shape.empty())
+        // {
+        //     auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape);
+        //     size_t count = nnfusion::shape_size(second);
+
+        //     string firstarg = (arg0_shape.empty() ? "input1" : "input0");
+        //     string secondarg = (arg0_shape.empty() ? "input0" : "input1");
+
+        //     lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
+
+        //     lu << "CUDA_SAFE_CALL(cudaMemcpy(output0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n";     // copy `firstarg` to `output0`
+        //     lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n";
+        // }
+        // // case 2: 1d Dot
+        // else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes))
+        // {
+        //     for (int i = 0; i < arg0_shape.size(); i++)
+        //     {
+        //         if (arg0_shape[i] != arg1_shape[i])
+        //         {
+        //             std::vector<std::string> arg_vec{"arg0", "arg1"};
+        //             std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+
+        //             NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+        //                                 << nnfusion::join(shape_vec) << " respectively, at Node "
+        //                                 << m_context->gnode->get_name()
+        //                                 << ", do not match for dot op";
+        //         }
+        //     }
+
+        //     size_t count = nnfusion::shape_size(arg0_shape);
+        //     lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
+
+        //     lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count
+        //     << ", static_cast<const float*>(input0), 1, static_cast<const float*>(input1), 1, "
+        //         "static_cast<float*>(output0)));\n";
+        // }
+        // // matrix * vector
+        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
+        // {
+        //     lu << "const float alpha = 1.0;\n const float beta = 0;\n";
+        //     lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
+        //     if (trans_A)
+        //         lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
+        //     else
+        //         lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", ";
+        //     lu << " &alpha,"
+        //     << " static_cast<const float*>(input0)," << arg0_shape[1] << ", "
+        //     << " static_cast<const float*>(input1),"
+        //     << " 1,"
+        //     << " &beta,"
+        //     << " static_cast<float*>(output0),"
+        //     << " 1));\n";
+        // }
+        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
+        //         (trans_A || trans_B))
+        // {
+        //     int m = trans_B ? arg1_shape[0] : arg1_shape[1];
+        //     int n = trans_A ? arg0_shape[1] : arg0_shape[0];
+        //     int k = trans_A ? arg0_shape[0] : arg0_shape[1];
+
+        //     lu << "const half alpha = 1.0;\nconst half beta = 0;\n";
+
+        //     lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+        //     << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
+        //     << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
+        //     << " " << n << ","
+        //     << " " << k << ","
+        //     << " &alpha,"
+        //     << " static_cast<const half*>(input1),"
+        //     << " " << arg1_shape[1] << ","
+        //     << " static_cast<const half*>(input0),"
+        //     << " " << arg0_shape[1] << ","
+        //     << " &beta,"
+        //     << " static_cast<half*>(output0),"
+        //     << " " << m << "));\n";
+        // } else {
+            size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
+            size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
+            size_t axes_for_k_count = reduction_axes;
+            size_t m = 1;
+            size_t n = 1;
+            size_t k = 1;
+
+            // check if input and output size correct
+            // check and calculate k for arg0 and arg1
+            size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
+            size_t arg1_k_idx = 0;                // first axe in arg1 for k
+
+            for (size_t i = 0; i < axes_for_k_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg0", "arg1"};
-                std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+                k *= arg0_shape[arg0_k_idx];
+                if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg0", "arg1"};
+                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                        << nnfusion::join(shape_vec) << " respectively, at Node "
-                                        << m_context->gnode->get_name()
-                                        << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                            << nnfusion::join(shape_vec) << " respectively, at Node "
+                                            << m_context->gnode->get_name()
+                                            << ", do not match for dot op";
+                }
             }
-        }
-        // check and calculate m for arg0 and out
-        size_t arg0_m_idx = 0; // first axe in arg0 for m
-        size_t out_m_idx = 0;  // first axe in out for m
-        for (size_t i = 0; i < axes_for_m_count; i++)
-        {
-            m *= arg0_shape[arg0_m_idx];
-            if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+            // check and calculate m for arg0 and out
+            size_t arg0_m_idx = 0; // first axe in arg0 for m
+            size_t out_m_idx = 0;  // first axe in out for m
+            for (size_t i = 0; i < axes_for_m_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg0", "output"};
-                std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
+                m *= arg0_shape[arg0_m_idx];
+                if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg0", "output"};
+                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                        << nnfusion::join(shape_vec) << " respectively, at Node "
-                                        << m_context->gnode->get_name()
-                                        << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                            << nnfusion::join(shape_vec) << " respectively, at Node "
+                                            << m_context->gnode->get_name()
+                                            << ", do not match for dot op";
+                }
             }
-        }
-        // check and calculate n for arg1 and out
-        size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
-        size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
-        for (size_t i = 0; i < axes_for_n_count; i++)
-        {
-            n *= arg1_shape[arg1_n_idx];
-            if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+            // check and calculate n for arg1 and out
+            size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
+            size_t out_n_idx = axes_for_m_count;  // first axe in out for n
+            for (size_t i = 0; i < axes_for_n_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg1", "output"};
-                std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
+                n *= arg1_shape[arg1_n_idx];
+                if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg1", "output"};
+                    std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                        << nnfusion::join(shape_vec) << " respectively, at Node "
-                                        << m_context->gnode->get_name()
-                                        << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                            << nnfusion::join(shape_vec) << " respectively, at Node "
+                                            << m_context->gnode->get_name()
+                                            << ", do not match for dot op";
+                }
             }
-        }
 
-        lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
-
-        lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-            << " CUBLAS_OP_N,"
-            << " CUBLAS_OP_N,"
-            << " " << n << ","
-            << " " << m << ","
-            << " " << k << ","
-            << " &alpha,"
-            << " static_cast<const half*>(input1),"
-            << " " << n << ","
-            << " static_cast<const half*>(input0),"
-            << " " << k << ","
-            << " &beta,"
-            << " static_cast<half*>(output0),"
-            << " " << n << "));\n";
+            lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
+
+            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+                << " CUBLAS_OP_N,"
+                << " CUBLAS_OP_N,"
+                << " " << n << ","
+                << " " << m << ","
+                << " " << k << ","
+                << " &alpha,"
+                << " static_cast<const half*>(input1),"
+                << " " << n << ","
+                << " static_cast<const half*>(input0),"
+                << " " << k << ","
+                << " &beta,"
+                << " static_cast<half*>(output0),"
+                << " " << n << "));\n";
+        // }
+        
     } else {
         NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot.";
     }
@@ -296,6 +376,8 @@ LanguageUnit_p cuda::Dot::emit_dependency()
     _lu->require(header::stdexcept);
     _lu->require(header::sstream);
     _lu->require(macro::CUBLAS_SAFE_CALL);
+    _lu->require(macro::CUDA_SAFE_CALL);
+    _lu->require(declaration::cuda_fp16_scale);
     //_lu->require(declaration::cublas_handle);
     return _lu;
 }
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
index ed603d218..76af4bb3e 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
@@ -23,15 +23,15 @@ cuda::MaxPool1D::MaxPool1D(shared_ptr<KernelContext> ctx)
     input_width = input_shape.back();
     output_width = output_shape.back();
 
-    input_type = ctx->inputs[0]->get_element_type().c_type_string();
-    output_type = ctx->outputs[0]->get_element_type().c_type_string();
+    input_type = ctx->inputs[0]->get_element_type();
+    output_type = ctx->outputs[0]->get_element_type();
 
     // NNFUSION_CHECK(input_shape.size() == 3)
     //     << "Input shape size of MaxPool1D is invalid, shape size: " << input_shape.size()
     //     << "expected 3";
 
     std::stringstream tag;
-    tag << "cuda_maxpool_" << input_type << "_" << output_type << "_iw"
+    tag << "cuda_maxpool_" << input_type.c_type_string() << "_" << output_type.c_type_string() << "_iw"
         << std::to_string(input_width) << "_ow" << std::to_string(output_width) << "_ww"
         << std::to_string(window_width) << "_wst" << std::to_string(window_stride_width);
     custom_tag = tag.str();
@@ -53,11 +53,11 @@ LanguageUnit_p cuda::MaxPool1D::emit_function_body()
         // Index into input tensor.
         lu << "size_t start = (tid / " << output_width << ") * " << input_width << " + "
            << " (tid % " << output_width << ") * " << window_stride[0] << ";\n";
-        lu << input_type << " max_val = " << TypeInfo::Get(input_type)->lowest() << ";\n";
+        lu << input_type.c_type_string() << " max_val = " << TypeInfo::Get(input_type)->lowest() << ";\n";
         lu << "for (size_t i = start; i < start + " << window_width << "; i++)\n";
         lu.block_begin();
         {
-            lu << "const " << input_type << " input = input0[i];\n";
+            lu << "const " << input_type.c_type_string() << " input = input0[i];\n";
             lu << "if (input > max_val)\n";
             lu.block_begin();
             {
@@ -98,6 +98,8 @@ cuda::MaxPoolmD::MaxPoolmD(shared_ptr<KernelContext> ctx)
     : CudaLibEmitter(ctx)
 {
     auto max_pool = static_pointer_cast<nnfusion::op::MaxPool>(ctx->gnode->get_op_ptr());
+    input_type = ctx->inputs[0]->get_element_type();
+    output_type = ctx->outputs[0]->get_element_type();
     input_shape = nnfusion::Shape(ctx->inputs[0]->get_shape());
     output_shape = nnfusion::Shape(ctx->outputs[0]->get_shape());
     window_shape = nnfusion::Shape(max_pool->get_window_shape());
@@ -105,11 +107,8 @@ cuda::MaxPoolmD::MaxPoolmD(shared_ptr<KernelContext> ctx)
     padding_above = nnfusion::Shape(max_pool->get_padding_above());
     window_stride = nnfusion::Strides(max_pool->get_window_movement_strides());
 
-    input_type = ctx->inputs[0]->get_element_type().c_type_string();
-    output_type = ctx->outputs[0]->get_element_type().c_type_string();
-
     std::stringstream tag;
-    tag << "cudnn_maxpool_dtype_" << output_type << "_i" << join(input_shape, "_") << "_o"
+    tag << "cudnn_maxpool_dtype_" << output_type.c_type_string() << "_i" << join(input_shape, "_") << "_o"
         << join(output_shape, "_") << "_ws" << join(window_shape, "_") << "_wst"
         << join(window_stride, "_") << "_pb" << join(padding_below, "_") << "_pb"
         << join(padding_above, "_");
@@ -124,8 +123,8 @@ LanguageUnit_p cuda::MaxPoolmD::emit_function_body()
     LanguageUnit_p _lu(new LanguageUnit(get_function_name()));
     auto& lu = *_lu;
 
-    auto input_desc = cudnn_tensor_descriptor_from_shape(input_shape, "input_desc");
-    auto output_desc = cudnn_tensor_descriptor_from_shape(output_shape, "output_desc");
+    auto input_desc = cudnn_tensor_descriptor_from_shape(input_shape, "input_desc", input_type);
+    auto output_desc = cudnn_tensor_descriptor_from_shape(output_shape, "output_desc", output_type);
     lu << input_desc->get_code();
     lu << output_desc->get_code();
 
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.hpp
index 3a14bdf1d..1be7a15af 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.hpp
@@ -24,8 +24,8 @@ namespace nnfusion
                 shared_ptr<KernelContext> kernel_ctx;
                 nnfusion::Shape input_shape, output_shape, window_shape, padding_below,
                     padding_above;
+                element::Type input_type, output_type;
                 nnfusion::Strides window_stride;
-                string input_type, output_type;
 
                 size_t window_width, window_stride_width, input_width, output_width;
             };
@@ -41,10 +41,10 @@ namespace nnfusion
                 bool require_cudnn_handle() override { return true; }
             private:
                 shared_ptr<KernelContext> kernel_ctx;
+                element::Type input_type, output_type;
                 nnfusion::Shape input_shape, output_shape, window_shape, padding_below,
                     padding_above;
                 nnfusion::Strides window_stride;
-                string input_type, output_type;
             };
         } // namespace cuda
     }     // namespace kernels
diff --git a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp
index 35b80d1cd..1779ad827 100644
--- a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp
+++ b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp
@@ -66,13 +66,13 @@ namespace nnfusion
                     return result.str();
                 }
 
-                inline int get_type_id(nnfusion::element::Type type)
-                {
-                    // TODO: fill more type cases
-                    if (type == nnfusion::element::f32)
-                        return DT_FLOAT;
-                    throw std::runtime_error("Not supported element type.");
-                }
+                // inline int get_type_id(nnfusion::element::Type type)
+                // {
+                //     // TODO: fill more type cases
+                //     if (type == nnfusion::element::f32)
+                //         return DT_FLOAT;
+                //     throw std::runtime_error("Not supported element type.");
+                // }
 
                 template <class T>
                 inline std::shared_ptr<T> get_op_object(std::shared_ptr<GNode>& curr)
diff --git a/src/nnfusion/engine/pass/graph/codegen_graphcore_pass.hpp b/src/nnfusion/engine/pass/graph/codegen_graphcore_pass.hpp
index bd272c0bf..45a8b389d 100644
--- a/src/nnfusion/engine/pass/graph/codegen_graphcore_pass.hpp
+++ b/src/nnfusion/engine/pass/graph/codegen_graphcore_pass.hpp
@@ -66,13 +66,13 @@ namespace nnfusion
                     return result.str();
                 }
 
-                inline int get_type_id(nnfusion::element::Type type)
-                {
-                    // TODO: fill more type cases
-                    if (type == nnfusion::element::f32)
-                        return DT_FLOAT;
-                    throw std::runtime_error("Not supported element type.");
-                }
+                // inline int get_type_id(nnfusion::element::Type type)
+                // {
+                //     // TODO: fill more type cases
+                //     if (type == nnfusion::element::f32)
+                //         return DT_FLOAT;
+                //     throw std::runtime_error("Not supported element type.");
+                // }
 
                 template <class T>
                 inline std::shared_ptr<T> get_op_object(std::shared_ptr<GNode>& curr)
diff --git a/test/nnfusion/kernels/batch_test.cpp b/test/nnfusion/kernels/batch_test.cpp
index 024bc4257..00bb1d983 100644
--- a/test/nnfusion/kernels/batch_test.cpp
+++ b/test/nnfusion/kernels/batch_test.cpp
@@ -66,7 +66,7 @@ namespace nnfusion
         ///\todo Maybe a better/general way
 
         template <typename T, typename val_t = float>
-        bool check_kernels(NNFusion_DeviceType dev_t, DataType data_t)
+        bool check_kernels(NNFusion_DeviceType dev_t, element::Type data_t)
         {
             for (int case_id = 0;; case_id++)
             {
@@ -92,255 +92,255 @@ namespace nnfusion
 ///param: node, device_type, data_type ... etc
 TEST(nnfusion_core_kernels, batch_kernel_tests_abs)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Abs>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Abs>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Abs>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Abs>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_add)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Add>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Add>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Add>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Add>(CUDA_GPU, element::f32));
 }
 
 /* TODO: arg type is bool, enable if bool data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_and)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::And>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::And>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::And>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::And>(CUDA_GPU, element::f32));
 }
 */
 
 /* TODO: arg index type is i32/i64, enable if more data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_arg_max)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ArgMax>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ArgMax>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ArgMax>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ArgMax>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_arg_min)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ArgMin>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ArgMin>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ArgMin>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ArgMin>(CUDA_GPU, element::f32));
 }
 */
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_broadcast)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Broadcast>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Broadcast>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Broadcast>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Broadcast>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_ceiling)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Ceiling>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Ceiling>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Ceiling>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Ceiling>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_concat)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Concat>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Concat>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Concat>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Concat>(CUDA_GPU, element::f32));
 }
 
 /* TODO: enable if more data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_convert)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Convert>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Convert>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Convert>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Convert>(CUDA_GPU, element::f32));
 }
 */
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_divide)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Divide>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Divide>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Divide>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Divide>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_dot)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Dot>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Dot>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Dot>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Dot>(CUDA_GPU, element::f32));
 }
 
 /* TODO: return type is bool, enable if bool data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_equal)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Equal>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Equal>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Equal>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Equal>(CUDA_GPU, element::f32));
 }
 */
 TEST(nnfusion_core_kernels, batch_kernel_tests_floor)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Floor>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Floor>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Floor>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Floor>(CUDA_GPU, element::f32));
 }
 
 /* TODO: return type is bool, enable if bool data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_greater)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Greater>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Greater>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Greater>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Greater>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_greater_eq)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::GreaterEq>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::GreaterEq>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::GreaterEq>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::GreaterEq>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_less)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Less>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Less>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Less>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Less>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_less_eq)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::LessEq>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::LessEq>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::LessEq>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::LessEq>(CUDA_GPU, element::f32));
 }
 */
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_max)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Max>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Max>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Max>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Max>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_max_pool)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::MaxPool>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::MaxPool>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::MaxPool>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::MaxPool>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_maximum)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Maximum>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Maximum>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Maximum>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Maximum>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_min)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Min>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Min>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Min>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Min>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_minimum)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Minimum>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Minimum>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Minimum>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Minimum>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_multiply)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Multiply>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Multiply>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Multiply>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Multiply>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_negative)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Negative>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Negative>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Negative>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Negative>(CUDA_GPU, element::f32));
 }
 
 /* TODO: enable if more data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_not)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Not>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Not>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Not>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Not>(CUDA_GPU, element::f32));
 }
 */
 /* TODO: return type is bool, enable if bool data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_not_equal)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::NotEqual>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::NotEqual>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::NotEqual>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::NotEqual>(CUDA_GPU, element::f32));
 }
 */
 /* TODO: enable if bool data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_or)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Or>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Or>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Or>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Or>(CUDA_GPU, element::f32));
 }
 */
 TEST(nnfusion_core_kernels, batch_kernel_tests_pad)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Pad>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Pad>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Pad>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Pad>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_product)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Product>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Product>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Product>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Product>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_relu)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Relu>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Relu>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Relu>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Relu>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_relu_backprop)
 {
     // TODO: there is no cpu kernel implemented
-    // EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ReluBackprop>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ReluBackprop>(CUDA_GPU, DT_FLOAT));
+    // EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ReluBackprop>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ReluBackprop>(CUDA_GPU, element::f32));
 }
 
 /* TODO: there is no replace slice kernel implemented
 TEST(nnfusion_core_kernels, batch_kernel_tests_replace_slice)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ReplaceSlice>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ReplaceSlice>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ReplaceSlice>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::ReplaceSlice>(CUDA_GPU, element::f32));
 }
 */
 TEST(nnfusion_core_kernels, batch_kernel_tests_reshape)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Reshape>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Reshape>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Reshape>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Reshape>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_reverse)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Reverse>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Reverse>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Reverse>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Reverse>(CUDA_GPU, element::f32));
 }
 
 /* TODO: enable if bool data type is supported, the test case data type should also be modified 
 TEST(nnfusion_core_kernels, batch_kernel_tests_select)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Select>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Select>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Select>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Select>(CUDA_GPU, element::f32));
 }
 */
 TEST(nnfusion_core_kernels, batch_kernel_tests_sign)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sign>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sign>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sign>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sign>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_slice)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Slice>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Slice>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Slice>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Slice>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_sqrt)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sqrt>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sqrt>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sqrt>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sqrt>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_subtract)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Subtract>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Subtract>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Subtract>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Subtract>(CUDA_GPU, element::f32));
 }
 
 TEST(nnfusion_core_kernels, batch_kernel_tests_sum)
 {
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sum>(GENERIC_CPU, DT_FLOAT));
-    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sum>(CUDA_GPU, DT_FLOAT));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sum>(GENERIC_CPU, element::f32));
+    EXPECT_TRUE(nnfusion::test::check_kernels<nnfusion::op::Sum>(CUDA_GPU, element::f32));
 }
\ No newline at end of file
diff --git a/thirdparty/ngraph/src/nnfusion/common/type_info.cpp b/thirdparty/ngraph/src/nnfusion/common/type_info.cpp
index 12d730860..d52c14193 100644
--- a/thirdparty/ngraph/src/nnfusion/common/type_info.cpp
+++ b/thirdparty/ngraph/src/nnfusion/common/type_info.cpp
@@ -18,6 +18,7 @@
 
 const nnfusion::TypeInfo::TypeDispatch nnfusion::TypeInfo::dispatcher{
     {"char", std::make_shared<nnfusion::TypeInfo_Impl<char>>()},
+    {"half", std::make_shared<nnfusion::TypeInfo_Impl<element::half>>()},
     {"float", std::make_shared<nnfusion::TypeInfo_Impl<float>>()},
     {"double", std::make_shared<nnfusion::TypeInfo_Impl<double>>()},
     {"int8_t", std::make_shared<nnfusion::TypeInfo_Impl<int8_t>>()},
diff --git a/thirdparty/ngraph/src/nnfusion/common/type_info.hpp b/thirdparty/ngraph/src/nnfusion/common/type_info.hpp
index 7c4a1c079..ff3858751 100644
--- a/thirdparty/ngraph/src/nnfusion/common/type_info.hpp
+++ b/thirdparty/ngraph/src/nnfusion/common/type_info.hpp
@@ -72,18 +72,18 @@ namespace nnfusion
         std::string max() const override { return to_string<T>(std::numeric_limits<T>::max()); }
     };
 
-    enum DataType
-    {
-        DT_FLOAT,
-        DT_DOUBLE,
-        DT_INT8,
-        DT_INT16,
-        DT_INT32,
-        DT_INT64,
-        DT_UINT8,
-        DT_UINT16,
-        DT_UINT32,
-        DT_UINT64,
-        DT_CHAR,
-    };
+    // enum DataType
+    // {
+    //     DT_FLOAT,
+    //     DT_DOUBLE,
+    //     DT_INT8,
+    //     DT_INT16,
+    //     DT_INT32,
+    //     DT_INT64,
+    //     DT_UINT8,
+    //     DT_UINT16,
+    //     DT_UINT32,
+    //     DT_UINT64,
+    //     DT_CHAR,
+    // };
 }
\ No newline at end of file

From c91a70c682c3ed70ae7b63b443f9333a4c4311df Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 27 Nov 2020 09:28:11 +0000
Subject: [PATCH 08/32] code style applied

---
 .../core/kernels/cpu/eigen/concat.cpp         |   2 +-
 .../core/kernels/cpu/eigen/convolution.cpp    |   2 +-
 src/nnfusion/core/kernels/cpu/eigen/dot.cpp   |   2 +-
 .../core/kernels/cpu/eigen/elementwise.cpp    |   7 +-
 src/nnfusion/core/kernels/cpu/eigen/lstm.cpp  |   2 +-
 .../core/kernels/cpu/eigen/max_pool.cpp       |   2 +-
 src/nnfusion/core/kernels/cpu/eigen/pad.cpp   |   4 +-
 .../core/kernels/cpu/eigen/reduce.cpp         |   7 +-
 .../core/kernels/cpu/eigen/softmax.cpp        |   4 +-
 .../core/kernels/cpu/general/anyop.cpp        |   4 +-
 .../core/kernels/cpu/general/reshape.cpp      |   7 +-
 src/nnfusion/core/kernels/cpu/mkl/dot.cpp     |   2 +-
 .../core/kernels/cpu/mlas/avg_pool.cpp        |   4 +-
 .../core/kernels/cpu/mlas/batch_matmul.cpp    |   2 +-
 .../core/kernels/cpu/mlas/convolution.cpp     |   4 +-
 src/nnfusion/core/kernels/cpu/mlas/dot.cpp    |   2 +-
 .../core/kernels/cpu/mlas/max_pool.cpp        |   4 +-
 .../kernels/cpu/reference/batch_matmul.cpp    |   4 +-
 .../core/kernels/cpu/reference/constant.cpp   |   4 +-
 .../core/kernels/cpu/reference/kernels.cpp    | 186 +++++++++---------
 .../core/kernels/cpu/reference/one_hot.cpp    |   4 +-
 .../core/kernels/cpu/reference/reduce_all.cpp |   4 +-
 .../kernels/cpu/reference/stop_gradient.cpp   |   4 +-
 .../core/kernels/cpu/reference/transpose.cpp  |   4 +-
 .../core/kernels/cpu/reference/variable.cpp   |   4 +-
 .../core/kernels/cpu/simd/elementwise.cpp     |   7 +-
 .../kernels/cpu/simd/elementwise_fused.cpp    |   2 +-
 .../core/kernels/cuda_gpu/cuda_cudnn.cpp      |   7 +-
 .../core/kernels/cuda_gpu/cuda_cudnn.hpp      |   6 +-
 .../core/kernels/cuda_gpu/cuda_langunit.cpp   |   8 +-
 .../cuda_gpu/inl/generate_kernel_code-inl.hpp |   4 +-
 .../core/kernels/cuda_gpu/kernels/addn.cpp    |   7 +-
 .../kernels/cuda_gpu/kernels/allreduce.cpp    |   4 +-
 .../core/kernels/cuda_gpu/kernels/anyop.cpp   |   4 +-
 .../kernels/cuda_gpu/kernels/apply_adam.cpp   |   7 +-
 .../kernels/apply_gradient_descent.cpp        |  14 +-
 .../cuda_gpu/kernels/apply_momentum.cpp       |   7 +-
 .../core/kernels/cuda_gpu/kernels/assign.cpp  |   7 +-
 .../kernels/cuda_gpu/kernels/assign_sub.cpp   |   7 +-
 .../kernels/cuda_gpu/kernels/avg_pool.cpp     |  24 +--
 .../kernels/cuda_gpu/kernels/avg_pool.hpp     |   4 +-
 .../kernels/cuda_gpu/kernels/batch_matmul.cpp |   8 +-
 .../kernels/cuda_gpu/kernels/batch_norm.cpp   |  23 ++-
 .../cuda_gpu/kernels/blockfusion_fused.cpp    |   2 +-
 .../kernels/cuda_gpu/kernels/broadcast.cpp    |   8 +-
 .../core/kernels/cuda_gpu/kernels/concat.cpp  |   8 +-
 .../cuda_gpu/kernels/concat_offset.cpp        |   4 +-
 .../kernels/cuda_gpu/kernels/constant.cpp     |   4 +-
 .../kernels/cuda_gpu/kernels/convolution.cpp  |  28 +--
 .../cuda_gpu/kernels/depthwise_conv2d.cpp     |   4 +-
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 155 ++++++++-------
 .../core/kernels/cuda_gpu/kernels/dropout.cpp |   8 +-
 .../cuda_gpu/kernels/dynamic_stitch.cpp       |   4 +-
 .../kernels/cuda_gpu/kernels/elementwise.cpp  |   2 +-
 .../cuda_gpu/kernels/elementwise_fused.cpp    |   2 +-
 .../kernels/cuda_gpu/kernels/gather_1d.cpp    |   8 +-
 .../kernels/cuda_gpu/kernels/gather_nd.cpp    |   8 +-
 .../cuda_gpu/kernels/invert_permutation.cpp   |   5 +-
 .../kernels/cuda_gpu/kernels/layer_norm.cpp   |   4 +-
 .../kernels/cuda_gpu/kernels/max_pool.cpp     |  19 +-
 .../core/kernels/cuda_gpu/kernels/one_hot.cpp |   4 +-
 .../core/kernels/cuda_gpu/kernels/pad.cpp     |   4 +-
 .../core/kernels/cuda_gpu/kernels/range.cpp   |   4 +-
 .../core/kernels/cuda_gpu/kernels/reduce.cpp  |  54 ++---
 .../kernels/cuda_gpu/kernels/reduce_all.cpp   |   4 +-
 .../core/kernels/cuda_gpu/kernels/reshape.cpp |  16 +-
 .../core/kernels/cuda_gpu/kernels/result.cpp  |   4 +-
 .../core/kernels/cuda_gpu/kernels/reverse.cpp |   4 +-
 .../cuda_gpu/kernels/reverse_sequence.cpp     |   8 +-
 .../kernels/rocm/batch_gemm_fixed.cpp         |   4 +-
 .../cuda_gpu/kernels/rocm/broadcast_host.cpp  |   4 +-
 .../kernels/rocm/broadcast_kernel.cpp         |   4 +-
 .../cuda_gpu/kernels/rocm/convfwd_fixed.cpp   |   4 +-
 .../cuda_gpu/kernels/rocm/convolution.cpp     |   4 +-
 .../cuda_gpu/kernels/rocm/gemm_fixed.cpp      |   4 +-
 .../cuda_gpu/kernels/rocm/reduce_sum.cpp      |   2 +-
 .../kernels/cuda_gpu/kernels/rocm/softmax.cpp |   4 +-
 .../core/kernels/cuda_gpu/kernels/scatter.cpp |   7 +-
 .../core/kernels/cuda_gpu/kernels/slice.cpp   |   4 +-
 .../core/kernels/cuda_gpu/kernels/softmax.cpp |   6 +-
 .../kernels/sparse_apply_momentum.cpp         |   7 +-
 .../cuda_gpu/kernels/stop_gradient.cpp        |   4 +-
 .../cuda_gpu/kernels/strided_slice_grad.cpp   |   4 +-
 .../core/kernels/cuda_gpu/kernels/tile.cpp    |  11 +-
 .../kernels/cuda_gpu/kernels/transpose.cpp    |   4 +-
 .../cuda_gpu/kernels/unsorted_segment_sum.cpp |   7 +-
 .../kernels/cuda_gpu/kernels/variable.cpp     |   4 +-
 .../core/kernels/cuda_gpu/kernels/zeros.cpp   |   4 +-
 .../core/kernels/kernel_registration.cpp      |   2 +-
 .../core/operators/generic_op/generic_op.hpp  |   2 +-
 .../engine/pass/codegen/cuda_codegen_pass.cpp |   4 +-
 .../batchnorm_inference_folding_pass.cpp      |   4 +-
 .../engine/pass/graph/kernel_selection.cpp    |   9 +-
 src/nnfusion/engine/profiler/profiler.cpp     |   3 +-
 .../frontend/tensorflow_import/ops/const.cpp  |  60 ++++--
 test/nnfusion/engine/profiler.cpp             |   3 +-
 test/nnfusion/kernels/sample.cpp              |   3 +-
 97 files changed, 523 insertions(+), 459 deletions(-)
 mode change 100755 => 100644 src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp

diff --git a/src/nnfusion/core/kernels/cpu/eigen/concat.cpp b/src/nnfusion/core/kernels/cpu/eigen/concat.cpp
index f7e3df997..435cc262a 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/concat.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/concat.cpp
@@ -209,6 +209,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Concat",                                                              // op_name
+    "Concat",                                                                  // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::ConcatEigen)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp b/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp
index 8bda08757..dec874ce2 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/convolution.cpp
@@ -143,6 +143,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Convolution",                                                         // op_name
+    "Convolution",                                                             // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::ConvolutionEigen)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/dot.cpp b/src/nnfusion/core/kernels/cpu/eigen/dot.cpp
index 4a04d623b..9a1ae81e1 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/dot.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/dot.cpp
@@ -89,6 +89,6 @@ LanguageUnit_p cpu::Dot::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Dot",                                                                 // op_name
+    "Dot",                                                                     // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::Dot)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp b/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp
index bfc79215f..9418aeb5d 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/elementwise.cpp
@@ -7,9 +7,10 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 #define REGISTER_EW_KERNEL(OP_NAME)                                                                \
-    REGISTER_KERNEL_EMITTER("" #OP_NAME "",                                                        \
-                            Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \
-                            cpu::ElementwiseEigen<nnfusion::op::OP_NAME>);
+    REGISTER_KERNEL_EMITTER(                                                                       \
+        "" #OP_NAME "",                                                                            \
+        Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4),                 \
+        cpu::ElementwiseEigen<nnfusion::op::OP_NAME>);
 
 REGISTER_EW_KERNEL(Abs)
 REGISTER_EW_KERNEL(Acos)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp b/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp
index 8c3ed6046..2675a6cde 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/lstm.cpp
@@ -144,7 +144,7 @@ LanguageUnit_p cpu::LstmEigen::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Lstm",                                                                // op_name
+    "Lstm",                                                                    // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::LstmEigen)
 
diff --git a/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp b/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp
index cc55948d7..cd1959a44 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/max_pool.cpp
@@ -168,6 +168,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "MaxPool",                                                             // op_name
+    "MaxPool",                                                                 // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
     cpu::MaxPoolEigen)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/pad.cpp b/src/nnfusion/core/kernels/cpu/eigen/pad.cpp
index aa82543a8..3010aa488 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/pad.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/pad.cpp
@@ -7,6 +7,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Pad",                                                                 // op_name
+    "Pad",                                                                     // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
-    cpu::Pad<float>)                                                       // constructor
+    cpu::Pad<float>)                                                           // constructor
diff --git a/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp b/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp
index 86065f66c..bd09ef09f 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/reduce.cpp
@@ -7,9 +7,10 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 #define REGISTER_EW_KERNEL(OP_NAME)                                                                \
-    REGISTER_KERNEL_EMITTER("" #OP_NAME "",                                                        \
-                            Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), \
-                            cpu::ReduceEigen<nnfusion::op::OP_NAME>);
+    REGISTER_KERNEL_EMITTER(                                                                       \
+        "" #OP_NAME "",                                                                            \
+        Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4),                 \
+        cpu::ReduceEigen<nnfusion::op::OP_NAME>);
 
 //REGISTER_EW_KERNEL(Sum)
 //REGISTER_EW_KERNEL(Product)
diff --git a/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp b/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp
index cb7a99800..692b6904a 100644
--- a/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp
+++ b/src/nnfusion/core/kernels/cpu/eigen/softmax.cpp
@@ -7,6 +7,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Softmax",                                                             // op_name
+    "Softmax",                                                                 // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("eigen").Priority(4), // attrs
-    cpu::SoftmaxEigen<float>)                                              // constructor
+    cpu::SoftmaxEigen<float>)                                                  // constructor
diff --git a/src/nnfusion/core/kernels/cpu/general/anyop.cpp b/src/nnfusion/core/kernels/cpu/general/anyop.cpp
index de5620391..8bbd9489c 100644
--- a/src/nnfusion/core/kernels/cpu/general/anyop.cpp
+++ b/src/nnfusion/core/kernels/cpu/general/anyop.cpp
@@ -35,6 +35,6 @@ LanguageUnit_p cpu::AnyOP::emit_dependency()
 
 // Register Pad kernel emitter
 
-REGISTER_KERNEL_EMITTER("AnyOP",                                                  //op_name
+REGISTER_KERNEL_EMITTER("AnyOP",                                                      //op_name
                         Device(GENERIC_CPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cpu::AnyOP)                                               // constructor
+                        cpu::AnyOP)                                                   // constructor
diff --git a/src/nnfusion/core/kernels/cpu/general/reshape.cpp b/src/nnfusion/core/kernels/cpu/general/reshape.cpp
index f8e694d00..9d54d418c 100644
--- a/src/nnfusion/core/kernels/cpu/general/reshape.cpp
+++ b/src/nnfusion/core/kernels/cpu/general/reshape.cpp
@@ -95,6 +95,7 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("Reshape", //op_name
-                        Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("cpu").Priority(2), //attrs
-                        cpu::ReshapeMemcpy) //constructor
+REGISTER_KERNEL_EMITTER(
+    "Reshape",                                                               //op_name
+    Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("cpu").Priority(2), //attrs
+    cpu::ReshapeMemcpy)                                                      //constructor
diff --git a/src/nnfusion/core/kernels/cpu/mkl/dot.cpp b/src/nnfusion/core/kernels/cpu/mkl/dot.cpp
index eb4e77fa4..d5230fc83 100644
--- a/src/nnfusion/core/kernels/cpu/mkl/dot.cpp
+++ b/src/nnfusion/core/kernels/cpu/mkl/dot.cpp
@@ -177,6 +177,6 @@ LanguageUnit_p cpu::DotMkl::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Dot",                                                               // op_name
+    "Dot",                                                                   // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mkl").Priority(3), // attrs
     cpu::DotMkl)
diff --git a/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp b/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp
index 68f8c484e..a8b9efac4 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/avg_pool.cpp
@@ -108,6 +108,6 @@ LanguageUnit_p cpu::AvgPoolMlas::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "AvgPool",                                                            // op_name
+    "AvgPool",                                                                // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
-    cpu::AvgPoolMlas)                                                     // constructor
+    cpu::AvgPoolMlas)                                                         // constructor
diff --git a/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp b/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp
index bcd21a959..3d07d1a15 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/batch_matmul.cpp
@@ -114,6 +114,6 @@ LanguageUnit_p cpu::BatchMatMulMlas::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "BatchMatMul",                                                        // op_name
+    "BatchMatMul",                                                            // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
     cpu::BatchMatMulMlas)
diff --git a/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp b/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp
index 48f635f5a..699f6354f 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/convolution.cpp
@@ -184,6 +184,6 @@ LanguageUnit_p cpu::ConvolutionMlas::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Convolution",                                                        // op_name
+    "Convolution",                                                            // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
-    cpu::ConvolutionMlas)                                                 // constructor
+    cpu::ConvolutionMlas)                                                     // constructor
diff --git a/src/nnfusion/core/kernels/cpu/mlas/dot.cpp b/src/nnfusion/core/kernels/cpu/mlas/dot.cpp
index 37ae88445..2dc3177a9 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/dot.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/dot.cpp
@@ -111,6 +111,6 @@ LanguageUnit_p cpu::DotMlas::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Dot",                                                                // op_name
+    "Dot",                                                                    // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
     cpu::DotMlas)
diff --git a/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp b/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp
index 85e2bc94b..7738e3065 100644
--- a/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp
+++ b/src/nnfusion/core/kernels/cpu/mlas/max_pool.cpp
@@ -110,6 +110,6 @@ LanguageUnit_p cpu::MaxPoolMlas::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "MaxPool",                                                            // op_name
+    "MaxPool",                                                                // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("mlas").Priority(6), // attrs
-    cpu::MaxPoolMlas)                                                     // constructor
+    cpu::MaxPoolMlas)                                                         // constructor
diff --git a/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp b/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp
index bbff5a1bd..ddbe1e076 100644
--- a/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/batch_matmul.cpp
@@ -113,9 +113,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "BatchMatMul",                                                 // op_name
+                "BatchMatMul",                                                     // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                BatchMatMulRef)                                                // constructor
+                BatchMatMulRef)                                                    // constructor
 
         } // namespace cpu
     }     // namespace kernels
diff --git a/src/nnfusion/core/kernels/cpu/reference/constant.cpp b/src/nnfusion/core/kernels/cpu/reference/constant.cpp
index e7094a52f..7917d10ad 100644
--- a/src/nnfusion/core/kernels/cpu/reference/constant.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/constant.cpp
@@ -69,6 +69,6 @@ namespace nnfusion
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
-REGISTER_KERNEL_EMITTER("Constant",                                   //op_name
+REGISTER_KERNEL_EMITTER("Constant",                                       //op_name
                         Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
-                        cpu::Constant)                                // constructor
\ No newline at end of file
+                        cpu::Constant)                                    // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cpu/reference/kernels.cpp b/src/nnfusion/core/kernels/cpu/reference/kernels.cpp
index 896fa385e..9b9afd156 100644
--- a/src/nnfusion/core/kernels/cpu/reference/kernels.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/kernels.cpp
@@ -2136,9 +2136,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Abs",                                                         // op_name
+                "Abs",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                AbsRef)                                                        // constructor
+                AbsRef)                                                            // constructor
 
             class AcosRef : public KernelEmitter
             {
@@ -2174,9 +2174,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Acos",                                                        // op_name
+                "Acos",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                AcosRef)                                                       // constructor
+                AcosRef)                                                           // constructor
 
             class AddRef : public KernelEmitter
             {
@@ -2212,9 +2212,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Add",                                                         // op_name
+                "Add",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                AddRef)                                                        // constructor
+                AddRef)                                                            // constructor
 
             class AllReduceRef : public KernelEmitter
             {
@@ -2251,9 +2251,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "AllReduce",                                                   // op_name
+                "AllReduce",                                                       // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                AllReduceRef)                                                  // constructor
+                AllReduceRef)                                                      // constructor
 
             class AsinRef : public KernelEmitter
             {
@@ -2289,9 +2289,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Asin",                                                        // op_name
+                "Asin",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                AsinRef)                                                       // constructor
+                AsinRef)                                                           // constructor
 
             class AtanRef : public KernelEmitter
             {
@@ -2327,9 +2327,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Atan",                                                        // op_name
+                "Atan",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                AtanRef)                                                       // constructor
+                AtanRef)                                                           // constructor
 
             class BroadcastRef : public KernelEmitter
             {
@@ -2367,9 +2367,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Broadcast",                                                   // op_name
+                "Broadcast",                                                       // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                BroadcastRef)                                                  // constructor
+                BroadcastRef)                                                      // constructor
 
             class CeilingRef : public KernelEmitter
             {
@@ -2405,9 +2405,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Ceiling",                                                     // op_name
+                "Ceiling",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                CeilingRef)                                                    // constructor
+                CeilingRef)                                                        // constructor
 
             class ConcatRef : public KernelEmitter
             {
@@ -2452,9 +2452,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Concat",                                                      // op_name
+                "Concat",                                                          // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                ConcatRef)                                                     // constructor
+                ConcatRef)                                                         // constructor
 
             /*
             class ConstantRef : public KernelEmitter
@@ -2529,9 +2529,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Convert",                                                     // op_name
+                "Convert",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                ConvertRef)                                                    // constructor
+                ConvertRef)                                                        // constructor
 
             class ConvolutionRef : public KernelEmitter
             {
@@ -2574,9 +2574,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Convolution",                                                 // op_name
+                "Convolution",                                                     // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                ConvolutionRef)                                                // constructor
+                ConvolutionRef)                                                    // constructor
 
             class CosRef : public KernelEmitter
             {
@@ -2612,9 +2612,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Cos",                                                         // op_name
+                "Cos",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                CosRef)                                                        // constructor
+                CosRef)                                                            // constructor
 
             class CoshRef : public KernelEmitter
             {
@@ -2650,9 +2650,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Cosh",                                                        // op_name
+                "Cosh",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                CoshRef)                                                       // constructor
+                CoshRef)                                                           // constructor
 
             class DivideRef : public KernelEmitter
             {
@@ -2688,9 +2688,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Divide",                                                      // op_name
+                "Divide",                                                          // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                DivideRef)                                                     // constructor
+                DivideRef)                                                         // constructor
 
             class EqualRef : public KernelEmitter
             {
@@ -2726,9 +2726,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Equal",                                                       // op_name
+                "Equal",                                                           // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                EqualRef)                                                      // constructor
+                EqualRef)                                                          // constructor
 
             class ExpRef : public KernelEmitter
             {
@@ -2764,9 +2764,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Exp",                                                         // op_name
+                "Exp",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                ExpRef)                                                        // constructor
+                ExpRef)                                                            // constructor
 
             class FloorRef : public KernelEmitter
             {
@@ -2802,9 +2802,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Floor",                                                       // op_name
+                "Floor",                                                           // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                FloorRef)                                                      // constructor
+                FloorRef)                                                          // constructor
 
             class GreaterRef : public KernelEmitter
             {
@@ -2840,9 +2840,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Greater",                                                     // op_name
+                "Greater",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                GreaterRef)                                                    // constructor
+                GreaterRef)                                                        // constructor
 
             class LessRef : public KernelEmitter
             {
@@ -2878,9 +2878,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Less",                                                        // op_name
+                "Less",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                LessRef)                                                       // constructor
+                LessRef)                                                           // constructor
 
             class LogRef : public KernelEmitter
             {
@@ -2916,9 +2916,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Log",                                                         // op_name
+                "Log",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                LogRef)                                                        // constructor
+                LogRef)                                                            // constructor
 
             class LRNRef : public KernelEmitter
             {
@@ -2956,9 +2956,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "LRN",                                                         // op_name
+                "LRN",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                LRNRef)                                                        // constructor
+                LRNRef)                                                            // constructor
 
             class MaxRef : public KernelEmitter
             {
@@ -2996,9 +2996,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Max",                                                         // op_name
+                "Max",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                MaxRef)                                                        // constructor
+                MaxRef)                                                            // constructor
 
             class MaximumRef : public KernelEmitter
             {
@@ -3034,9 +3034,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Maximum",                                                     // op_name
+                "Maximum",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                MaximumRef)                                                    // constructor
+                MaximumRef)                                                        // constructor
 
             class MinRef : public KernelEmitter
             {
@@ -3074,9 +3074,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Min",                                                         // op_name
+                "Min",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                MinRef)                                                        // constructor
+                MinRef)                                                            // constructor
 
             class MinimumRef : public KernelEmitter
             {
@@ -3112,9 +3112,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Minimum",                                                     // op_name
+                "Minimum",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                MinimumRef)                                                    // constructor
+                MinimumRef)                                                        // constructor
 
             class MultiplyRef : public KernelEmitter
             {
@@ -3150,9 +3150,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Multiply",                                                    // op_name
+                "Multiply",                                                        // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                MultiplyRef)                                                   // constructor
+                MultiplyRef)                                                       // constructor
 
             class NegativeRef : public KernelEmitter
             {
@@ -3188,9 +3188,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Negative",                                                    // op_name
+                "Negative",                                                        // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                NegativeRef)                                                   // constructor
+                NegativeRef)                                                       // constructor
 
             class PowerRef : public KernelEmitter
             {
@@ -3226,9 +3226,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Power",                                                       // op_name
+                "Power",                                                           // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                PowerRef)                                                      // constructor
+                PowerRef)                                                          // constructor
 
             class ProductRef : public KernelEmitter
             {
@@ -3266,9 +3266,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Product",                                                     // op_name
+                "Product",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                ProductRef)                                                    // constructor
+                ProductRef)                                                        // constructor
 
             class ReluRef : public KernelEmitter
             {
@@ -3304,9 +3304,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Relu",                                                        // op_name
+                "Relu",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                ReluRef)                                                       // constructor
+                ReluRef)                                                           // constructor
 
             class SelectRef : public KernelEmitter
             {
@@ -3343,9 +3343,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Select",                                                      // op_name
+                "Select",                                                          // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SelectRef)                                                     // constructor
+                SelectRef)                                                         // constructor
 
             class SigmoidRef : public KernelEmitter
             {
@@ -3381,9 +3381,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Sigmoid",                                                     // op_name
+                "Sigmoid",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SigmoidRef)                                                    // constructor
+                SigmoidRef)                                                        // constructor
 
             class SignRef : public KernelEmitter
             {
@@ -3419,9 +3419,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Sign",                                                        // op_name
+                "Sign",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SignRef)                                                       // constructor
+                SignRef)                                                           // constructor
 
             class SinRef : public KernelEmitter
             {
@@ -3457,9 +3457,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Sin",                                                         // op_name
+                "Sin",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SinRef)                                                        // constructor
+                SinRef)                                                            // constructor
 
             class SinhRef : public KernelEmitter
             {
@@ -3495,9 +3495,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Sinh",                                                        // op_name
+                "Sinh",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SinhRef)                                                       // constructor
+                SinhRef)                                                           // constructor
 
             class SliceRef : public KernelEmitter
             {
@@ -3536,9 +3536,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Slice",                                                       // op_name
+                "Slice",                                                           // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SliceRef)                                                      // constructor
+                SliceRef)                                                          // constructor
 
             class SoftmaxRef : public KernelEmitter
             {
@@ -3580,9 +3580,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Softmax",                                                     // op_name
+                "Softmax",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SoftmaxRef)                                                    // constructor
+                SoftmaxRef)                                                        // constructor
 
             class SqrtRef : public KernelEmitter
             {
@@ -3618,9 +3618,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Sqrt",                                                        // op_name
+                "Sqrt",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SqrtRef)                                                       // constructor
+                SqrtRef)                                                           // constructor
 
             class SubtractRef : public KernelEmitter
             {
@@ -3656,9 +3656,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Subtract",                                                    // op_name
+                "Subtract",                                                        // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SubtractRef)                                                   // constructor
+                SubtractRef)                                                       // constructor
 
             class SumRef : public KernelEmitter
             {
@@ -3696,9 +3696,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Sum",                                                         // op_name
+                "Sum",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                SumRef)                                                        // constructor
+                SumRef)                                                            // constructor
 
             class TanRef : public KernelEmitter
             {
@@ -3734,9 +3734,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Tan",                                                         // op_name
+                "Tan",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                TanRef)                                                        // constructor
+                TanRef)                                                            // constructor
 
             class TanhRef : public KernelEmitter
             {
@@ -3772,9 +3772,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Tanh",                                                        // op_name
+                "Tanh",                                                            // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                TanhRef)                                                       // constructor
+                TanhRef)                                                           // constructor
 
             class BatchNormRef : public KernelEmitter
             {
@@ -3811,7 +3811,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "BatchNormInference",                                          // op_name
+                "BatchNormInference",                                              // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 BatchNormRef)
 
@@ -3855,7 +3855,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "AvgPool",                                                     // op_name
+                "AvgPool",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 AvgPoolRef)
 
@@ -3896,7 +3896,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Dot",                                                         // op_name
+                "Dot",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 DotRef)
 
@@ -3939,7 +3939,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "MaxPool",                                                     // op_name
+                "MaxPool",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 MaxPoolRef)
 
@@ -3981,7 +3981,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Pad",                                                         // op_name
+                "Pad",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 PadRef)
 
@@ -4021,7 +4021,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Reshape",                                                     // op_name
+                "Reshape",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ReshapeRef)
 
@@ -4067,7 +4067,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Result",                                                      // op_name
+                "Result",                                                          // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ResultRef)
 
@@ -4105,7 +4105,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "LessEq",                                                      // op_name
+                "LessEq",                                                          // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 LessEqRef)
 
@@ -4145,7 +4145,7 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Reverse",                                                     // op_name
+                "Reverse",                                                         // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
                 ReverseRef)
 
diff --git a/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp b/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp
index ddaa06613..769d0b0cf 100644
--- a/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/one_hot.cpp
@@ -68,9 +68,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "OneHot",                                                      // op_name
+                "OneHot",                                                          // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                OneHotRef)                                                     // constructor
+                OneHotRef)                                                         // constructor
 
         } // namespace cpu
     }     // namespace kernels
diff --git a/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp b/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp
index 20362bfad..2fea9d3c0 100644
--- a/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/reduce_all.cpp
@@ -61,9 +61,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "All",                                                         // op_name
+                "All",                                                             // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                AllRef)                                                        // constructor
+                AllRef)                                                            // constructor
 
         } // namespace cpu
     }     // namespace kernels
diff --git a/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp b/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp
index 531cea738..7678ef56b 100644
--- a/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/stop_gradient.cpp
@@ -54,9 +54,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "StopGradient",                                                // op_name
+                "StopGradient",                                                    // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                StopGradientRef)                                               // constructor
+                StopGradientRef)                                                   // constructor
 
         } // namespace cpu
     }     // namespace kernels
diff --git a/src/nnfusion/core/kernels/cpu/reference/transpose.cpp b/src/nnfusion/core/kernels/cpu/reference/transpose.cpp
index 4b9508899..cd487a7c0 100644
--- a/src/nnfusion/core/kernels/cpu/reference/transpose.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/transpose.cpp
@@ -113,9 +113,9 @@ namespace nnfusion
             };
 
             REGISTER_KERNEL_EMITTER(
-                "Transpose",                                                   // op_name
+                "Transpose",                                                       // op_name
                 Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("reference"), // attrs
-                TransposeRef)                                                  // constructor
+                TransposeRef)                                                      // constructor
 
         } // namespace cpu
     }     // namespace kernels
diff --git a/src/nnfusion/core/kernels/cpu/reference/variable.cpp b/src/nnfusion/core/kernels/cpu/reference/variable.cpp
index 2b8bf9c0c..5e16388f6 100644
--- a/src/nnfusion/core/kernels/cpu/reference/variable.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/variable.cpp
@@ -67,6 +67,6 @@ namespace nnfusion
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
-REGISTER_KERNEL_EMITTER("Variable",                                   //op_name
+REGISTER_KERNEL_EMITTER("Variable",                                       //op_name
                         Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
-                        cpu::Variable)                                // constructor
\ No newline at end of file
+                        cpu::Variable)                                    // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp b/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp
index cd725f542..05120d0b3 100644
--- a/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp
+++ b/src/nnfusion/core/kernels/cpu/simd/elementwise.cpp
@@ -7,9 +7,10 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 #define REGISTER_EW_KERNEL(OP_NAME)                                                                \
-    REGISTER_KERNEL_EMITTER("" #OP_NAME "",                                                        \
-                            Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5),  \
-                            cpu::ElementwiseSimd<nnfusion::op::OP_NAME>);
+    REGISTER_KERNEL_EMITTER(                                                                       \
+        "" #OP_NAME "",                                                                            \
+        Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5),                  \
+        cpu::ElementwiseSimd<nnfusion::op::OP_NAME>);
 
 REGISTER_EW_KERNEL(Abs)
 REGISTER_EW_KERNEL(Ceiling)
diff --git a/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp b/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp
index 1423244bf..c94b31b06 100644
--- a/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp
+++ b/src/nnfusion/core/kernels/cpu/simd/elementwise_fused.cpp
@@ -438,6 +438,6 @@ LanguageUnit_p ElementwiseFused::emit_comments()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "ElementwiseFused",                                                   // op_name
+    "ElementwiseFused",                                                       // op_name
     Device(GENERIC_CPU).TypeConstraint(element::f32).Tag("simd").Priority(5), // attrs
     cpu::ElementwiseFused)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
index f8891b025..548e1e4e0 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.cpp
@@ -31,7 +31,9 @@ std::string cuda::get_cudnn_datatype(element::Type dtype)
     return p->second;
 }
 
-LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape, string desc, element::Type type)
+LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape,
+                                                        string desc,
+                                                        element::Type type)
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
@@ -92,7 +94,8 @@ LanguageUnit_p cuda::cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& s
     return _lu;
 }
 
-LanguageUnit_p cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type)
+LanguageUnit_p
+    cuda::get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type)
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
index fc8cdd8dc..3c5a1e013 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_cudnn.hpp
@@ -16,13 +16,15 @@ namespace nnfusion
             std::string get_cudnn_datatype(element::Type type);
             LanguageUnit_p cudnn_tensor_descriptor_from_shape(const nnfusion::Shape& shape,
                                                               string desc,
-                                                              element::Type type = element::f32);
+                                                              element::Type type);
             LanguageUnit_p get_cudnn_convolution_descriptor(const Shape& padding,
                                                             const Strides& window_movement_strides,
                                                             const Strides& window_dilation_strides,
                                                             string desc,
                                                             element::Type type = element::f32);
-            LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape, string desc, element::Type type = element::f32);
+            LanguageUnit_p get_cudnn_filter_descriptor(const Shape& shape,
+                                                       string desc,
+                                                       element::Type type = element::f32);
             LanguageUnit_p get_dropout_global_states(float ratio);
             inline std::string ratio2str(float ratio)
             {
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
index 565c18c49..ac8b9e90a 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
@@ -253,9 +253,8 @@ __device__ __forceinline__ int64_t  load(const int64_t*  __restrict__ in, int i=
 }
 )");
 
-LU_DEFINE(
-  declaration::cuda_fp16_scale,
-  R"(
+LU_DEFINE(declaration::cuda_fp16_scale,
+          R"(
 __global__ void nnfusionHalfScaleKernel(half *x, half *alpha, size_t count)
 {
     size_t offset = threadIdx.x + blockIdx.x * blockDim.x;
@@ -270,8 +269,7 @@ void nnfusionHalfScale(half *x, half *alpha, size_t len)
 {
     nnfusionHalfScaleKernel<<<(len+255)/256, 256>>>(x, alpha, len);
 }
-  )"
-)
+  )")
 
 LU_DEFINE_EXTEND(declaration::cuda_reduce_primitive,
                  R"(
diff --git a/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp b/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp
index 9ef0d123c..7d9c2131d 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/inl/generate_kernel_code-inl.hpp
@@ -64,6 +64,6 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER(__KernelOpType__,                                             // op_name
+REGISTER_KERNEL_EMITTER(__KernelOpType__,                                                 // op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs
-                        cuda::__KernelUniqueClassName__)                              // constructor
+                        cuda::__KernelUniqueClassName__) // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp
index b19c6d2ae..9972df1af 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/addn.cpp
@@ -89,6 +89,7 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("AddN",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::AddN)
+REGISTER_KERNEL_EMITTER(
+    "AddN",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::AddN)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp
index 525968ab2..383c9b600 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/allreduce.cpp
@@ -49,6 +49,6 @@ namespace nnfusion
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
-REGISTER_KERNEL_EMITTER("AllReduce",                                           //op_name
+REGISTER_KERNEL_EMITTER("AllReduce",                                               //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::SuperScalerAllReduce)                            // constructor
+                        cuda::SuperScalerAllReduce)                                // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp
index 1adc0952c..2bf5882e7 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/anyop.cpp
@@ -34,6 +34,6 @@ LanguageUnit_p cuda::AnyOP::emit_dependency()
 
 // Register Pad kernel emitter
 
-REGISTER_KERNEL_EMITTER("AnyOP",                                               //op_name
+REGISTER_KERNEL_EMITTER("AnyOP",                                                   //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::AnyOP)                                           // constructor
+                        cuda::AnyOP)                                               // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
index 566e1b535..e42e0eda9 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
@@ -105,6 +105,7 @@ if(i == 0)
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("ApplyAdam",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::ApplyAdam)
\ No newline at end of file
+REGISTER_KERNEL_EMITTER(
+    "ApplyAdam",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::ApplyAdam)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp
index 05dcc1087..908502b41 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_gradient_descent.cpp
@@ -75,9 +75,11 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("ApplyGradient",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::ApplyGradientDescent)
-REGISTER_KERNEL_EMITTER("ApplyGradientDescent",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::ApplyGradientDescent)
+REGISTER_KERNEL_EMITTER(
+    "ApplyGradient",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::ApplyGradientDescent)
+REGISTER_KERNEL_EMITTER(
+    "ApplyGradientDescent",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::ApplyGradientDescent)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp
index 72716bd4e..a506653f1 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_momentum.cpp
@@ -79,6 +79,7 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("ApplyMomentum",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::ApplyMomentum)
+REGISTER_KERNEL_EMITTER(
+    "ApplyMomentum",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::ApplyMomentum)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp
index f22ba886b..914b2f7cb 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign.cpp
@@ -75,6 +75,7 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("Assign",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::Assign)
+REGISTER_KERNEL_EMITTER(
+    "Assign",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::Assign)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp
index d21de903d..4e380a74f 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/assign_sub.cpp
@@ -75,6 +75,7 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("AssignSub",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::AssignSub)
+REGISTER_KERNEL_EMITTER(
+    "AssignSub",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::AssignSub)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp
index bc2c56b1b..479156586 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.cpp
@@ -50,8 +50,8 @@ cuda::AvgPool1D::AvgPool1D(shared_ptr<KernelContext> ctx)
     padding_above = nnfusion::Shape(avg_pool->get_padding_above());
     window_stride = nnfusion::Strides(avg_pool->get_window_movement_strides());
     include_pad = avg_pool->get_include_padding_in_avg_computation();
-    input_type = ctx->inputs[0]->get_element_type().c_type_string();
-    output_type = ctx->outputs[0]->get_element_type().c_type_string();
+    input_type = ctx->inputs[0]->get_element_type();
+    output_type = ctx->outputs[0]->get_element_type();
 
     // NNFUSION_CHECK(input_shape.size() == 3)
     //     << "Input shape size of AvgPool1D is invalid, shape size: " << input_shape.size()
@@ -265,16 +265,16 @@ cuda::AvgPoolmD::AvgPoolmD(shared_ptr<KernelContext> ctx)
     padding_above = nnfusion::Shape(avg_pool->get_padding_above());
     window_stride = nnfusion::Strides(avg_pool->get_window_movement_strides());
     include_pad = avg_pool->get_include_padding_in_avg_computation();
-    input_type = ctx->inputs[0]->get_element_type().c_type_string();
-    output_type = ctx->outputs[0]->get_element_type().c_type_string();
+    input_type = ctx->inputs[0]->get_element_type();
+    output_type = ctx->outputs[0]->get_element_type();
 
     NNFUSION_CHECK(input_shape.size() == 4 || input_shape.size() == 5)
         << "Input shape size of AvgPoolmD is invalid, shape size: " << input_shape.size()
         << "expected 4 or 5";
 
     std::stringstream tag;
-    tag << "cudnn_avgpool_dtype_" << output_type << "_i" << join(input_shape, "_") << "_o"
-        << join(output_shape, "_") << "_ws" << join(window_stride, "_") << "_wst"
+    tag << "cudnn_avgpool_dtype_" << output_type.c_type_string() << "_i" << join(input_shape, "_")
+        << "_o" << join(output_shape, "_") << "_ws" << join(window_stride, "_") << "_wst"
         << join(window_stride, "_") << "_pb" << join(padding_below, "_") << "_pb"
         << join(padding_above, "_");
     custom_tag = tag.str();
@@ -288,8 +288,8 @@ LanguageUnit_p cuda::AvgPoolmD::emit_function_body()
     auto cudnn_avg_type = include_pad ? "CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING"
                                       : "CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING";
 
-    auto input_desc = cudnn_tensor_descriptor_from_shape(input_shape, "input_desc");
-    auto output_desc = cudnn_tensor_descriptor_from_shape(output_shape, "output_desc");
+    auto input_desc = cudnn_tensor_descriptor_from_shape(input_shape, "input_desc", input_type);
+    auto output_desc = cudnn_tensor_descriptor_from_shape(output_shape, "output_desc", output_type);
     lu << input_desc->get_code();
     lu << output_desc->get_code();
 
@@ -404,11 +404,11 @@ LanguageUnit_p cuda::AvgPoolmD::emit_function_signature()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "AvgPool",                                                                // op_name
+    "AvgPool",                                                                    // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::AvgPool1D)                                                          // constructor
+    cuda::AvgPool1D)                                                              // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "AvgPool",                                                                 // op_name
+    "AvgPool",                                                                     // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
-    cuda::AvgPoolmD)                                                           // constructor
+    cuda::AvgPoolmD)                                                               // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.hpp
index da1a55717..68c45e233 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/avg_pool.hpp
@@ -54,7 +54,7 @@ namespace nnfusion
                     padding_above;
                 nnfusion::Strides window_stride;
                 bool include_pad;
-                string input_type, output_type;
+                element::Type input_type, output_type;
 
                 // Precompute for fast constant memory access.
                 int HW, DHW, CDHW, PQ, MPQ, KMPQ, RS, TRS;
@@ -78,7 +78,7 @@ namespace nnfusion
                     padding_above;
                 nnfusion::Strides window_stride;
                 bool include_pad;
-                string input_type, output_type;
+                element::Type input_type, output_type;
             };
         } // namespace cuda
     }     // namespace kernels
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
index ad81ec3d8..173e95e93 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
@@ -192,11 +192,11 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "BatchMatMul",                                                            // op_name
+    "BatchMatMul",                                                                // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::BatchMatMul)                                                        // constructor
+    cuda::BatchMatMul)                                                            // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "BatchMatMul",                                                            // op_name
+    "BatchMatMul",                                                                // op_name
     Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::BatchMatMul)                                                        // constructor
+    cuda::BatchMatMul)                                                            // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp
index 6a64bee78..050a27a58 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_norm.cpp
@@ -28,7 +28,7 @@ LanguageUnit_p cuda::BatchNorm::emit_function_body()
 {
     LanguageUnit_p _lu(new LanguageUnit(get_function_name()));
     auto& lu = *_lu;
-    auto tensor_desc = cudnn_tensor_descriptor_from_shape(tensor_shape, "tensor_desc");
+    auto tensor_desc = cudnn_tensor_descriptor_from_shape(tensor_shape, "tensor_desc", dtype);
     lu << tensor_desc->get_code();
     // derived_param_desc
     lu << "cudnnTensorDescriptor_t derived_param_desc;\n";
@@ -201,12 +201,15 @@ void cuda::BatchNormNCHW::set_launch_config()
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn").Priority(2), // attrs
-                        cuda::BatchNorm)      // constructor
-REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs
-                        cuda::BatchNormNCHW)  // constructor
-REGISTER_KERNEL_EMITTER("BatchNormInference", // op_name
-                        Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs
-                        cuda::BatchNormNCHW) // constructor
+REGISTER_KERNEL_EMITTER(
+    "BatchNormInference",                                                   // op_name
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn").Priority(2), // attrs
+    cuda::BatchNorm)                                                        // constructor
+REGISTER_KERNEL_EMITTER(
+    "BatchNormInference",                                                  // op_name
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs
+    cuda::BatchNormNCHW)                                                   // constructor
+REGISTER_KERNEL_EMITTER(
+    "BatchNormInference",                                                  // op_name
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda").Priority(2), // attrs
+    cuda::BatchNormNCHW)                                                   // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp
index 22cea1999..877b84477 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/blockfusion_fused.cpp
@@ -71,6 +71,6 @@ void BlockFusionFused::set_launch_config()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "BlockFusionFused",                                                       // op_name
+    "BlockFusionFused",                                                           // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::BlockFusionFused)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp
index 08660ed41..f839422c7 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/broadcast.cpp
@@ -194,10 +194,10 @@ namespace nnfusion
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
-REGISTER_KERNEL_EMITTER("Broadcast",                                           //op_name
+REGISTER_KERNEL_EMITTER("Broadcast",                                               //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::Broadcast)                                       // constructor
+                        cuda::Broadcast)                                           // constructor
 
-REGISTER_KERNEL_EMITTER("Broadcast",                                           //op_name
+REGISTER_KERNEL_EMITTER("Broadcast",                                               //op_name
                         Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::RocmBroadcast)                                   // constructor
+                        cuda::RocmBroadcast)                                       // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp
index 3b39408e0..6d7791f61 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat.cpp
@@ -324,9 +324,9 @@ namespace nnfusion
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
-REGISTER_KERNEL_EMITTER("Concat",                                  //op_name
+REGISTER_KERNEL_EMITTER("Concat",                                      //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32), //attrs
-                        cuda::Concat)                              // constructor
+                        cuda::Concat)                                  // constructor
 
 namespace nnfusion
 {
@@ -479,6 +479,6 @@ namespace nnfusion
     }     // namespace kernels
 } // namespace nnfusion
 
-REGISTER_KERNEL_EMITTER("Concat",                                  //op_name
+REGISTER_KERNEL_EMITTER("Concat",                                      //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32), //attrs
-                        cuda::ConcatKernel)                        // constructor
+                        cuda::ConcatKernel)                            // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp
index 586091381..2c812666b 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/concat_offset.cpp
@@ -73,6 +73,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "ConcatOffset",                                                           // op_name
+    "ConcatOffset",                                                               // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::ConcatOffset)                                                       // constructor
+    cuda::ConcatOffset)                                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
index d6e0d9000..035b00bb7 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
@@ -119,6 +119,6 @@ namespace nnfusion
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
-REGISTER_KERNEL_EMITTER("Constant",                                            //op_name
+REGISTER_KERNEL_EMITTER("Constant",                                                //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::Constant)                                        // constructor
\ No newline at end of file
+                        cuda::Constant)                                            // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
index 7ddcb0426..6b91e3956 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
@@ -15,8 +15,10 @@ cuda::ConvolutionCudnn::ConvolutionCudnn(shared_ptr<KernelContext> ctx)
     input_type = ctx->inputs[0]->get_element_type();
     filter_type = ctx->inputs[1]->get_element_type();
     output_type = ctx->outputs[0]->get_element_type();
-    NNFUSION_CHECK(input_type == filter_type && input_type == output_type) 
-        << "Convolution input datatype (" << input_type << ") should be the same with that of filter (" << filter_type << "), and that of output (" << output_type << ").";
+    NNFUSION_CHECK(input_type == filter_type && input_type == output_type)
+        << "Convolution input datatype (" << input_type
+        << ") should be the same with that of filter (" << filter_type << "), and that of output ("
+        << output_type << ").";
     conv_type = input_type;
     input_shape = ctx->inputs[0]->get_shape();
     filter_shape = ctx->inputs[1]->get_shape();
@@ -85,14 +87,18 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_body()
         padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
     }
 
-
     {
         // lu << "cudnnDataType_t data_type = " << get_cudnn_datatype(dtype) << ";\n";
-        lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type)->get_code();
-        lu << cudnn_tensor_descriptor_from_shape(output_shape, "tensor_desc_1", output_type)->get_code();
+        lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type)
+                  ->get_code();
+        lu << cudnn_tensor_descriptor_from_shape(output_shape, "tensor_desc_1", output_type)
+                  ->get_code();
         lu << get_cudnn_filter_descriptor(filter_shape, "filter_desc", filter_type)->get_code();
-        lu << get_cudnn_convolution_descriptor(
-                  padding_below, window_movement_strides, window_dilation_strides, "conv_desc", conv_type)
+        lu << get_cudnn_convolution_descriptor(padding_below,
+                                               window_movement_strides,
+                                               window_dilation_strides,
+                                               "conv_desc",
+                                               conv_type)
                   ->get_code();
 
         lu << R"(
@@ -213,11 +219,11 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_signature()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Convolution",                                                             // op_name
+    "Convolution",                                                                 // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
-    cuda::ConvolutionCudnn)                                                    // constructor
+    cuda::ConvolutionCudnn)                                                        // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "Convolution",                                                             // op_name
+    "Convolution",                                                                 // op_name
     Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cudnn_kernel").Priority(2), // attrs
-    cuda::ConvolutionCudnn)                                                    // constructor
+    cuda::ConvolutionCudnn)                                                        // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp
index 2b1dce365..d4ccbead1 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/depthwise_conv2d.cpp
@@ -372,6 +372,6 @@ LanguageUnit_p cuda::DepthwiseConv2dNative::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "DepthwiseConv2dNative",                                                  // op_name
+    "DepthwiseConv2dNative",                                                      // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::DepthwiseConv2dNative)                                              // constructor
+    cuda::DepthwiseConv2dNative)                                                  // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 2f07207c3..7a9bef553 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -203,7 +203,9 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                << " static_cast<float*>(output0),"
                << " " << n << "));\n";
         }
-    } else if (dtype == element::f16) {
+    }
+    else if (dtype == element::f16)
+    {
         // case 1: Scalar * Tensor
         // if (arg0_shape.empty() || arg1_shape.empty())
         // {
@@ -282,86 +284,87 @@ LanguageUnit_p cuda::Dot::emit_function_body()
         //     << " static_cast<half*>(output0),"
         //     << " " << m << "));\n";
         // } else {
-            size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
-            size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
-            size_t axes_for_k_count = reduction_axes;
-            size_t m = 1;
-            size_t n = 1;
-            size_t k = 1;
-
-            // check if input and output size correct
-            // check and calculate k for arg0 and arg1
-            size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
-            size_t arg1_k_idx = 0;                // first axe in arg1 for k
-
-            for (size_t i = 0; i < axes_for_k_count; i++)
+        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
+        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
+        size_t axes_for_k_count = reduction_axes;
+        size_t m = 1;
+        size_t n = 1;
+        size_t k = 1;
+
+        // check if input and output size correct
+        // check and calculate k for arg0 and arg1
+        size_t arg0_k_idx = axes_for_m_count; // first axis in arg0 for k
+        size_t arg1_k_idx = 0;                // first axis in arg1 for k
+
+        for (size_t i = 0; i < axes_for_k_count; i++)
+        {
+            k *= arg0_shape[arg0_k_idx];
+            if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
             {
-                k *= arg0_shape[arg0_k_idx];
-                if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg0", "arg1"};
-                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+                std::vector<std::string> arg_vec{"arg0", "arg1"};
+                std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
 
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                            << nnfusion::join(shape_vec) << " respectively, at Node "
-                                            << m_context->gnode->get_name()
-                                            << ", do not match for dot op";
-                }
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                      << nnfusion::join(shape_vec) << " respectively, at Node "
+                                      << m_context->gnode->get_name()
+                                      << ", do not match for dot op";
             }
-            // check and calculate m for arg0 and out
-            size_t arg0_m_idx = 0; // first axe in arg0 for m
-            size_t out_m_idx = 0;  // first axe in out for m
-            for (size_t i = 0; i < axes_for_m_count; i++)
+        }
+        // check and calculate m for arg0 and out
+        size_t arg0_m_idx = 0; // first axis in arg0 for m
+        size_t out_m_idx = 0;  // first axis in out for m
+        for (size_t i = 0; i < axes_for_m_count; i++)
+        {
+            m *= arg0_shape[arg0_m_idx];
+            if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
             {
-                m *= arg0_shape[arg0_m_idx];
-                if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg0", "output"};
-                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
+                std::vector<std::string> arg_vec{"arg0", "output"};
+                std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
 
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                            << nnfusion::join(shape_vec) << " respectively, at Node "
-                                            << m_context->gnode->get_name()
-                                            << ", do not match for dot op";
-                }
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                      << nnfusion::join(shape_vec) << " respectively, at Node "
+                                      << m_context->gnode->get_name()
+                                      << ", do not match for dot op";
             }
-            // check and calculate n for arg1 and out
-            size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
-            size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
-            for (size_t i = 0; i < axes_for_n_count; i++)
+        }
+        // check and calculate n for arg1 and out
+        size_t arg1_n_idx = axes_for_k_count; // first axis in arg1 for n
+        size_t out_n_idx = axes_for_m_count;  // first axis in out for n
+        for (size_t i = 0; i < axes_for_n_count; i++)
+        {
+            n *= arg1_shape[arg1_n_idx];
+            if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
             {
-                n *= arg1_shape[arg1_n_idx];
-                if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg1", "output"};
-                    std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
+                std::vector<std::string> arg_vec{"arg1", "output"};
+                std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
 
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                            << nnfusion::join(shape_vec) << " respectively, at Node "
-                                            << m_context->gnode->get_name()
-                                            << ", do not match for dot op";
-                }
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                      << nnfusion::join(shape_vec) << " respectively, at Node "
+                                      << m_context->gnode->get_name()
+                                      << ", do not match for dot op";
             }
+        }
 
-            lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
-
-            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-                << " CUBLAS_OP_N,"
-                << " CUBLAS_OP_N,"
-                << " " << n << ","
-                << " " << m << ","
-                << " " << k << ","
-                << " &alpha,"
-                << " static_cast<const half*>(input1),"
-                << " " << n << ","
-                << " static_cast<const half*>(input0),"
-                << " " << k << ","
-                << " &beta,"
-                << " static_cast<half*>(output0),"
-                << " " << n << "));\n";
+        lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
+
+        lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+           << " CUBLAS_OP_N,"
+           << " CUBLAS_OP_N,"
+           << " " << n << ","
+           << " " << m << ","
+           << " " << k << ","
+           << " &alpha,"
+           << " static_cast<const half*>(input1),"
+           << " " << n << ","
+           << " static_cast<const half*>(input0),"
+           << " " << k << ","
+           << " &beta,"
+           << " static_cast<half*>(output0),"
+           << " " << n << "));\n";
         // }
-        
-    } else {
+    }
+    else
+    {
         NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot.";
     }
     //lu.block_end();
@@ -419,16 +422,16 @@ LanguageUnit_p cuda::Dot::emit_function_signature()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Dot",                                                               // op_name
+    "Dot",                                                                   // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
-    cuda::Dot)                                                           // constructor
+    cuda::Dot)                                                               // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "Dot",                                                               // op_name
+    "Dot",                                                                   // op_name
     Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs
-    cuda::Dot)                                                           // constructor
+    cuda::Dot)                                                               // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "Dot",                                                               // op_name
+    "Dot",                                                                   // op_name
     Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
-    cuda::Dot)                                                           // constructor
+    cuda::Dot)                                                               // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp
index 25b4c51ce..96a5e976e 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dropout.cpp
@@ -253,10 +253,10 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("DropoutTraining",                                      // op_name
+REGISTER_KERNEL_EMITTER("DropoutTraining",                                          // op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn"), // attrs
-                        cuda::DropoutTraining)                                  // constructor
+                        cuda::DropoutTraining)                                      // constructor
 
-REGISTER_KERNEL_EMITTER("DropoutTrainingGrad",                                  // op_name
+REGISTER_KERNEL_EMITTER("DropoutTrainingGrad",                                      // op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn"), // attrs
-                        cuda::DropoutTrainingGrad)                              // constructor
+                        cuda::DropoutTrainingGrad)                                  // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
index 96385a2f2..4bd847949 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
@@ -122,6 +122,6 @@ LanguageUnit_p cuda::DynamicStitch::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "DynamicStitch",                                                          // op_name
+    "DynamicStitch",                                                              // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::DynamicStitch)                                                      // constructor
\ No newline at end of file
+    cuda::DynamicStitch)                                                          // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp
index 9597d122e..4f5b2b6cc 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise.cpp
@@ -9,7 +9,7 @@ using namespace nnfusion::kernels;
 #define REGISTER_EW_KERNEL(OP_NAME)                                                                \
     REGISTER_KERNEL_EMITTER(                                                                       \
         "" #OP_NAME "",                                                                            \
-        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("element_wise").Priority(2),                 \
+        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("element_wise").Priority(2),             \
         cuda::ElementWise<nnfusion::op::OP_NAME>);
 
 REGISTER_EW_KERNEL(Abs)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp
index c4c3b0bdc..b29e0fa16 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/elementwise_fused.cpp
@@ -340,6 +340,6 @@ void ElementWiseFused::compute_best_config(int& grids, int& blocks, int& bound)
 }
 
 REGISTER_KERNEL_EMITTER(
-    "ElementWiseFused",                                                       // op_name
+    "ElementWiseFused",                                                           // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
     cuda::ElementWiseFused)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp
index 94fa0e506..8b1d4836c 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_1d.cpp
@@ -115,9 +115,9 @@ LanguageUnit_p cuda::Gather1D::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "GatherV2",                                                               // op_name
+    "GatherV2",                                                                   // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Gather1D)                                                           // constructor
+    cuda::Gather1D)                                                               // constructor
 
 cuda::Gather1DGrad::Gather1DGrad(shared_ptr<KernelContext> ctx)
     : BlockCudaEmitter(ctx)
@@ -228,6 +228,6 @@ LanguageUnit_p cuda::Gather1DGrad::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "GatherGrad",                                                             // op_name
+    "GatherGrad",                                                                 // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Gather1DGrad)                                                       // constructor
+    cuda::Gather1DGrad)                                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp
index 6743691cf..4b36dba4f 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/gather_nd.cpp
@@ -278,10 +278,10 @@ atomic_add(output0 + x_offset, __ldg(input1 + y_offset));
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("GatherND",                                                   // op_name
+REGISTER_KERNEL_EMITTER("GatherND",                                                       // op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs
-                        cuda::GatherND)                                               // constructor
+                        cuda::GatherND) // constructor
 
-REGISTER_KERNEL_EMITTER("GatherNDGrad",                                               // op_name
+REGISTER_KERNEL_EMITTER("GatherNDGrad",                                                   // op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs
-                        cuda::GatherNDGrad)                                           // constructor
+                        cuda::GatherNDGrad) // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp
index 6dd9bba2e..e20dd2492 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/invert_permutation.cpp
@@ -65,6 +65,7 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("InvertPermutation",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Priority(
-                            2), // TODO: this op input and output will all be int
+                        Device(CUDA_GPU)
+                            .TypeConstraint(element::f32)
+                            .Priority(2), // TODO: this op input and output will all be int
                         cuda::InvertPermutation)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp
index bc9a6f463..5e39b3b65 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/layer_norm.cpp
@@ -77,6 +77,6 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("LayerNorm",                                              // op_name
+REGISTER_KERNEL_EMITTER("LayerNorm",                                                  // op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudalib"), // attrs
-                        cuda::LayerNorm)                                          // constructor
+                        cuda::LayerNorm)                                              // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
index 76af4bb3e..9bce41fe7 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/max_pool.cpp
@@ -31,8 +31,8 @@ cuda::MaxPool1D::MaxPool1D(shared_ptr<KernelContext> ctx)
     //     << "expected 3";
 
     std::stringstream tag;
-    tag << "cuda_maxpool_" << input_type.c_type_string() << "_" << output_type.c_type_string() << "_iw"
-        << std::to_string(input_width) << "_ow" << std::to_string(output_width) << "_ww"
+    tag << "cuda_maxpool_" << input_type.c_type_string() << "_" << output_type.c_type_string()
+        << "_iw" << std::to_string(input_width) << "_ow" << std::to_string(output_width) << "_ww"
         << std::to_string(window_width) << "_wst" << std::to_string(window_stride_width);
     custom_tag = tag.str();
 }
@@ -53,7 +53,8 @@ LanguageUnit_p cuda::MaxPool1D::emit_function_body()
         // Index into input tensor.
         lu << "size_t start = (tid / " << output_width << ") * " << input_width << " + "
            << " (tid % " << output_width << ") * " << window_stride[0] << ";\n";
-        lu << input_type.c_type_string() << " max_val = " << TypeInfo::Get(input_type)->lowest() << ";\n";
+        lu << input_type.c_type_string() << " max_val = " << TypeInfo::Get(input_type)->lowest()
+           << ";\n";
         lu << "for (size_t i = start; i < start + " << window_width << "; i++)\n";
         lu.block_begin();
         {
@@ -108,8 +109,8 @@ cuda::MaxPoolmD::MaxPoolmD(shared_ptr<KernelContext> ctx)
     window_stride = nnfusion::Strides(max_pool->get_window_movement_strides());
 
     std::stringstream tag;
-    tag << "cudnn_maxpool_dtype_" << output_type.c_type_string() << "_i" << join(input_shape, "_") << "_o"
-        << join(output_shape, "_") << "_ws" << join(window_shape, "_") << "_wst"
+    tag << "cudnn_maxpool_dtype_" << output_type.c_type_string() << "_i" << join(input_shape, "_")
+        << "_o" << join(output_shape, "_") << "_ws" << join(window_shape, "_") << "_wst"
         << join(window_stride, "_") << "_pb" << join(padding_below, "_") << "_pb"
         << join(padding_above, "_");
     custom_tag = tag.str();
@@ -239,11 +240,11 @@ LanguageUnit_p cuda::MaxPoolmD::emit_function_signature()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "MaxPool",                                                                // op_name
+    "MaxPool",                                                                    // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::MaxPool1D)                                                          // constructor
+    cuda::MaxPool1D)                                                              // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "MaxPool",                                                                 // op_name
+    "MaxPool",                                                                     // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
-    cuda::MaxPoolmD)                                                           // constructor
+    cuda::MaxPoolmD)                                                               // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp
index 1368e1244..2cccc00f2 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/one_hot.cpp
@@ -109,6 +109,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "OneHot",                                                                 // op_name
+    "OneHot",                                                                     // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::OneHot)                                                             // constructor
+    cuda::OneHot)                                                                 // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
index 0ac7149dc..faab94fe9 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
@@ -147,6 +147,6 @@ KernelRegistrar kernel_registrar0(
 */
 
 REGISTER_KERNEL_EMITTER(
-    "Pad",                                                                    // op_name
+    "Pad",                                                                        // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Pad)                                                                // constructor
\ No newline at end of file
+    cuda::Pad)                                                                    // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
index 397eeb9b4..1c5a30279 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
@@ -63,6 +63,6 @@ LanguageUnit_p cuda::Range::emit_dependency()
     return _lu;
 }
 REGISTER_KERNEL_EMITTER(
-    "Range",                                                                  // op_name
+    "Range",                                                                      // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Range)                                                              // constructor
\ No newline at end of file
+    cuda::Range)                                                                  // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp
index 15dd2d3ce..835754c46 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.cpp
@@ -6,56 +6,62 @@
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("Max",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::Reduce<nnfusion::op::Max>)
+REGISTER_KERNEL_EMITTER(
+    "Max",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::Reduce<nnfusion::op::Max>)
 
 REGISTER_KERNEL_EMITTER(
-    "Max",                                                                 // op_name
+    "Max",                                                                     // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Max>)
 
-REGISTER_KERNEL_EMITTER("Min",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::Reduce<nnfusion::op::Min>)
+REGISTER_KERNEL_EMITTER(
+    "Min",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::Reduce<nnfusion::op::Min>)
 
 REGISTER_KERNEL_EMITTER(
-    "Min",                                                                 // op_name
+    "Min",                                                                     // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Min>)
 
-REGISTER_KERNEL_EMITTER("Product",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::Reduce<nnfusion::op::Multiply>)
+REGISTER_KERNEL_EMITTER(
+    "Product",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::Reduce<nnfusion::op::Multiply>)
 
 REGISTER_KERNEL_EMITTER(
-    "Product",                                                             // op_name
+    "Product",                                                                 // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Multiply>)
 
-REGISTER_KERNEL_EMITTER("Sum",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::Reduce<nnfusion::op::Add>)
+REGISTER_KERNEL_EMITTER(
+    "Sum",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::Reduce<nnfusion::op::Add>)
 
 REGISTER_KERNEL_EMITTER(
-    "Sum",                                                                 // op_name
+    "Sum",                                                                     // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Add>)
 
-REGISTER_KERNEL_EMITTER("Sum",
-                        Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::Reduce<nnfusion::op::Add>)
+REGISTER_KERNEL_EMITTER(
+    "Sum",
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::Reduce<nnfusion::op::Add>)
 
 REGISTER_KERNEL_EMITTER(
-    "Sum",                                                                 // op_name
+    "Sum",                                                                     // op_name
     Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Add>)
 
-REGISTER_KERNEL_EMITTER("ReduceAny",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::Reduce<nnfusion::op::Or>)
+REGISTER_KERNEL_EMITTER(
+    "ReduceAny",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::Reduce<nnfusion::op::Or>)
 
 REGISTER_KERNEL_EMITTER(
-    "ReduceAny",                                                           // op_name
+    "ReduceAny",                                                               // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
     cuda::ReduceMemcpy<nnfusion::op::Or>)
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp
index 6760325ec..1298e6fbf 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce_all.cpp
@@ -109,6 +109,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "All",                                                                    // op_name
+    "All",                                                                        // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::All)                                                                // constructor
+    cuda::All)                                                                    // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp
index 168a91011..aa9fc49e4 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reshape.cpp
@@ -557,21 +557,21 @@ LanguageUnit_p cuda::ReshapeMemcpy::emit_function_signature()
 // Register Reshape kernel emitter
 
 REGISTER_KERNEL_EMITTER(
-    "Reshape",                                                                   // op_name
+    "Reshape",                                                                       // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_2D").Priority(2), // attrs
-    cuda::Reshape2D)                                                             // constructor
+    cuda::Reshape2D)                                                                 // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "Reshape",                                                                   // op_name
+    "Reshape",                                                                       // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_3D").Priority(2), // attrs
-    cuda::Reshape3D)                                                             // constructor
+    cuda::Reshape3D)                                                                 // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "Reshape",                                                                  // op_name
+    "Reshape",                                                                      // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel_D").Priority(2), // attrs
-    cuda::ReshapehD)                                                            // constructor
+    cuda::ReshapehD)                                                                // constructor
 
 REGISTER_KERNEL_EMITTER(
-    "Reshape",                                                             // op_name
+    "Reshape",                                                                 // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
-    cuda::ReshapeMemcpy)                                                   // constructor
+    cuda::ReshapeMemcpy)                                                       // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
index 46e81ade7..f3429c956 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
@@ -84,6 +84,6 @@ LanguageUnit_p cuda::Result::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Result",                                                              // op_name
+    "Result",                                                                  // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
-    cuda::Result)                                                          // constructor
\ No newline at end of file
+    cuda::Result)                                                              // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
index 36f2d39b5..6d5fc374d 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
@@ -101,6 +101,6 @@ LanguageUnit_p cuda::Reverse::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Reverse",                                                                // op_name
+    "Reverse",                                                                    // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Reverse)                                                            // constructor
\ No newline at end of file
+    cuda::Reverse)                                                                // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
index 6f0e0e2bc..487951930 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
@@ -124,10 +124,10 @@ if (tid < @threads@) {
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER(
-    "ReverseSequence",                                                        // op_name
+    "ReverseSequence",                                                            // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::ReverseSequence)                                                    // constructor
+    cuda::ReverseSequence)                                                        // constructor
 
-REGISTER_KERNEL_EMITTER("ReverseSequence",                                     // op_name
+REGISTER_KERNEL_EMITTER("ReverseSequence",                                         // op_name
                         Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), // attrs
-                        cuda::RocmReverseSequence)                             // constructor
\ No newline at end of file
+                        cuda::RocmReverseSequence)                                 // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp
index 5ed2d8ea4..de4ec59e9 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/batch_gemm_fixed.cpp
@@ -112,6 +112,6 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("BatchMatMul",                                         // op_name
+REGISTER_KERNEL_EMITTER("BatchMatMul",                                             // op_name
                         Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), // attrs
-                        cuda::BatchGemmFixed)                                  // constructor
+                        cuda::BatchGemmFixed)                                      // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp
index 63dd091e4..f8658791d 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_host.cpp
@@ -172,6 +172,6 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("Broadcast",                                           //op_name
+REGISTER_KERNEL_EMITTER("Broadcast",                                               //op_name
                         Device(ROCM_GPU).TypeConstraint(element::f32).Priority(3), //attrs
-                        cuda::RocmBiasBroadcast)                               // constructor
+                        cuda::RocmBiasBroadcast)                                   // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp
index f5f8b3a50..7c2318f9c 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/broadcast_kernel.cpp
@@ -295,6 +295,6 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("Broadcast",                                           // op_name
+REGISTER_KERNEL_EMITTER("Broadcast",                                               // op_name
                         Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4), // attrs
-                        cuda::RocmManualBroadcast)                             // constructor
+                        cuda::RocmManualBroadcast)                                 // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp
index 04fb041b8..01bf11715 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convfwd_fixed.cpp
@@ -141,6 +141,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Convolution",                                                            // op_name
+    "Convolution",                                                                // op_name
     Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::ConvFwdFixed)                                                       // constructor
+    cuda::ConvFwdFixed)                                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp
index 2b19b8db0..6abefd6e6 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/convolution.cpp
@@ -226,6 +226,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Convolution",                                                             // op_name
+    "Convolution",                                                                 // op_name
     Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
-    cuda::RocmConvolutionCudnn)                                                // constructor
+    cuda::RocmConvolutionCudnn)                                                    // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp
index 983b7fce3..1fd205a99 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/gemm_fixed.cpp
@@ -207,6 +207,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Dot",                                                                    // op_name
+    "Dot",                                                                        // op_name
     Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::GemmFixed)                                                          // constructor
+    cuda::GemmFixed)                                                              // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp
index a476ddd06..d12253ba7 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/reduce_sum.cpp
@@ -335,7 +335,7 @@ using namespace nnfusion::kernels;
 
 #define REGISTER_GPU_KERNEL(KEY, OP_NAME)                                                          \
     REGISTER_KERNEL_EMITTER(KEY,                                                                   \
-                            Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4),                 \
+                            Device(ROCM_GPU).TypeConstraint(element::f32).Priority(4),             \
                             cuda::RocmReduce<nnfusion::op::OP_NAME>)
 
 REGISTER_GPU_KERNEL("Sum", Add)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp
index a2a54782d..c10db2e19 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/rocm/softmax.cpp
@@ -134,6 +134,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Softmax",                                                                // op_name
+    "Softmax",                                                                    // op_name
     Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::RocmSoftmax)                                                        // constructor
+    cuda::RocmSoftmax)                                                            // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp
index fb2fd5930..b88350fda 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/scatter.cpp
@@ -97,9 +97,10 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 #define REGISTER_SCATTER_KERNEL(OP_NAME, KERNEL_NAME)                                              \
-    REGISTER_KERNEL_EMITTER("" #KERNEL_NAME "",                                                    \
-                            Device(CUDA_GPU).TypeConstraint(element::f32).Tag("scatter").Priority(2),  \
-                            cuda::Scatter<nnfusion::op::OP_NAME>);
+    REGISTER_KERNEL_EMITTER(                                                                       \
+        "" #KERNEL_NAME "",                                                                        \
+        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("scatter").Priority(2),                  \
+        cuda::Scatter<nnfusion::op::OP_NAME>);
 
 REGISTER_SCATTER_KERNEL(Subtract, ScatterSub);
 REGISTER_SCATTER_KERNEL(Add, ScatterAdd);
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp
index d5bb5910d..766255b2c 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/slice.cpp
@@ -138,6 +138,6 @@ LanguageUnit_p cuda::Slice::emit_dependency()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Slice",                                                                  // op_name
+    "Slice",                                                                      // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Slice)                                                              // constructor
+    cuda::Slice)                                                                  // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
index 0dba0aec4..c653abedc 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
@@ -132,9 +132,9 @@ LanguageUnit_p cuda::Softmax::emit_function_signature()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "Softmax",                                                                 // op_name
+    "Softmax",                                                                     // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
-    cuda::Softmax)                                                             // constructor
+    cuda::Softmax)                                                                 // constructor
 
 cuda::SoftmaxGrad::SoftmaxGrad(shared_ptr<KernelContext> ctx)
     : CudaLibEmitter(ctx)
@@ -266,6 +266,6 @@ LanguageUnit_p cuda::SoftmaxGrad::emit_function_signature()
 }
 
 REGISTER_KERNEL_EMITTER(
-    "SoftmaxGrad",                                                             // op_name
+    "SoftmaxGrad",                                                                 // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cudnn_kernel").Priority(2), // attrs
     cuda::SoftmaxGrad)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp
index 063716257..bbfac36eb 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/sparse_apply_momentum.cpp
@@ -126,6 +126,7 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("SparseApplyMomentum",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::SparseApplyMomentum)
+REGISTER_KERNEL_EMITTER(
+    "SparseApplyMomentum",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::SparseApplyMomentum)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp
index 6f7192da6..00cfe33fa 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/stop_gradient.cpp
@@ -115,6 +115,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "StopGradient",                                                           // op_name
+    "StopGradient",                                                               // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::StopGradient)                                                       // constructor
+    cuda::StopGradient)                                                           // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
index d796ef822..342edf949 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
@@ -113,6 +113,6 @@ LanguageUnit_p cuda::StridedSliceGrad::emit_dependency()
     return _lu;
 }
 REGISTER_KERNEL_EMITTER(
-    "StridedSliceGrad",                                                       // op_name
+    "StridedSliceGrad",                                                           // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::StridedSliceGrad)                                                   // constructor
\ No newline at end of file
+    cuda::StridedSliceGrad)                                                       // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
index 4b22d7c25..33a869e71 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
@@ -146,10 +146,11 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("Tile",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::Tile)
+REGISTER_KERNEL_EMITTER(
+    "Tile",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::Tile)
 
-REGISTER_KERNEL_EMITTER("Tile",                                                //op_name
+REGISTER_KERNEL_EMITTER("Tile",                                                    //op_name
                         Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::RocmTile)                                        // constructor
\ No newline at end of file
+                        cuda::RocmTile)                                            // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp
index 7016e0518..824a799cf 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/transpose.cpp
@@ -143,6 +143,6 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 
 REGISTER_KERNEL_EMITTER(
-    "Transpose",                                                              // op_name
+    "Transpose",                                                                  // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Transpose)                                                          // constructor
+    cuda::Transpose)                                                              // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp
index e816cd7b9..576136cd9 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/unsorted_segment_sum.cpp
@@ -185,6 +185,7 @@ atomicAdd(output0 + output_index, input0[tid]);
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("UnsortedSegmentSum",
-                        Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-                        cuda::UnsortedSegmentSum)
+REGISTER_KERNEL_EMITTER(
+    "UnsortedSegmentSum",
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
+    cuda::UnsortedSegmentSum)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
index 368e24241..419124649 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
@@ -77,6 +77,6 @@ namespace nnfusion
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
-REGISTER_KERNEL_EMITTER("Variable",                                            //op_name
+REGISTER_KERNEL_EMITTER("Variable",                                                //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::Variable)                                        // constructor
\ No newline at end of file
+                        cuda::Variable)                                            // constructor
\ No newline at end of file
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp
index 0a457b435..0ebc875a0 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/zeros.cpp
@@ -58,6 +58,6 @@ namespace nnfusion
 using namespace nnfusion;
 using namespace nnfusion::kernels;
 
-REGISTER_KERNEL_EMITTER("Zeros",                                                      // op_name
+REGISTER_KERNEL_EMITTER("Zeros",                                                          // op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel"), // attrs
-                        cuda::Zeros)                                                  // constructor
+                        cuda::Zeros) // constructor
diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp
index f18d75b7d..f527906fe 100644
--- a/src/nnfusion/core/kernels/kernel_registration.cpp
+++ b/src/nnfusion/core/kernels/kernel_registration.cpp
@@ -2,8 +2,8 @@
 // Licensed under the MIT License.
 
 #include "kernel_registration.hpp"
-#include "nnfusion/util/util.hpp"
 #include "ngraph/src/nnfusion/common/type/element_type.hpp"
+#include "nnfusion/util/util.hpp"
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;
diff --git a/src/nnfusion/core/operators/generic_op/generic_op.hpp b/src/nnfusion/core/operators/generic_op/generic_op.hpp
index 2a9f36171..f61552569 100644
--- a/src/nnfusion/core/operators/generic_op/generic_op.hpp
+++ b/src/nnfusion/core/operators/generic_op/generic_op.hpp
@@ -6,8 +6,8 @@
 #include <iomanip>
 #include <limits>
 
-#include "nnfusion/common/common.hpp"
 #include "ngraph/src/nnfusion/common/type/element_type.hpp"
+#include "nnfusion/common/common.hpp"
 
 #define REGISTER_OP(op_x)                                                                          \
     static nnfusion::op::OpConfig __register_op_##op_x = nnfusion::op::build_op_config(#op_x)
diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
old mode 100755
new mode 100644
index fd1c3b456..0aa467fd2
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -803,8 +803,8 @@ void CudaCodegenPass::create_header_file(std::shared_ptr<InterpreterContext> ctx
     lu_header << declaration::typedef_int->get_code() << "\n";
     if (device_type() == CUDA_GPU || device_type() == ROCM_GPU)
         lu_header << header::cuda->get_code();
-        // TODO only include this if half is used
-        lu_header << header::cuda_fp16->get_code();
+    // TODO only include this if half is used
+    lu_header << header::cuda_fp16->get_code();
 
     lu_header << "extern \"C\" int kernel_entry(";
     std::string params = get_kernel_entry_paras(tu);
diff --git a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
index 1dc3b5521..a0dfaf000 100644
--- a/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
+++ b/src/nnfusion/engine/pass/graph/batchnorm_inference_folding_pass.cpp
@@ -887,7 +887,7 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptr<nnfusion::graph
     if (folding_flag)
     {
         NNFUSION_LOG(INFO) << "batchnorm inference folding Pass starts up for Graph: "
-                                       << graph->get_name();
+                           << graph->get_name();
         for (auto pattern : BN_FOLDING_PATTERNS)
         {
             BatchNormInferenceOptimizer optimizer(graph, pattern);
@@ -899,7 +899,7 @@ bool BatchNormInferenceFoldingPass::run_on_graph(std::shared_ptr<nnfusion::graph
             const_folding_optimizer.run_on_graph(graph);
         }
         NNFUSION_LOG(INFO) << "batchnorm inference folding Pass ends for Graph: "
-                                       << graph->get_name();
+                           << graph->get_name();
     }
     return true;
 }
\ No newline at end of file
diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp b/src/nnfusion/engine/pass/graph/kernel_selection.cpp
index 7b212ee2f..7216cacc1 100644
--- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp
+++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp
@@ -26,7 +26,8 @@ pair<NNFusion_DeviceType, kernels::KernelEmitter::Pointer>
                                                  IProfilingRuntime::Pointer runtime)
 {
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32);
+        KernelRegistry::Global()->FindKernelRegistrations(
+            gnode->get_op_type(), devtype, element::f32);
 
     // Skip since only one candidate or constant
     if (kernel_regs.size() == 1 || gnode->is_constant())
@@ -143,7 +144,8 @@ pair<NNFusion_DeviceType, kernels::KernelEmitter::Pointer>
 {
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32);
+        KernelRegistry::Global()->FindKernelRegistrations(
+            gnode->get_op_type(), devtype, element::f32);
 
     if (devtype == ROCM_GPU)
     {
@@ -355,7 +357,8 @@ pair<NNFusion_DeviceType, kernels::KernelEmitter::Pointer>
                                         NNFusion_DeviceType devtype)
 {
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), devtype, element::f32);
+        KernelRegistry::Global()->FindKernelRegistrations(
+            gnode->get_op_type(), devtype, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
     std::vector<std::string> functions;
 
diff --git a/src/nnfusion/engine/profiler/profiler.cpp b/src/nnfusion/engine/profiler/profiler.cpp
index 174c3c108..0f4efbfab 100644
--- a/src/nnfusion/engine/profiler/profiler.cpp
+++ b/src/nnfusion/engine/profiler/profiler.cpp
@@ -82,7 +82,8 @@ void GraphEvaluate::create_profiling_contexts(shared_ptr<GNode> gnode)
         return;
     }
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), dev_type, element::f32);
+        KernelRegistry::Global()->FindKernelRegistrations(
+            gnode->get_op_type(), dev_type, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
 
     for (auto kernel_reg : kernel_regs)
diff --git a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
index 1d8f5e926..670303535 100644
--- a/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
+++ b/src/nnfusion/frontend/tensorflow_import/ops/const.cpp
@@ -167,42 +167,62 @@ namespace nnfusion
                 // int_val, float_val, etc.
                 if (tensor_content_size == 0)
                 {
-
-#define GET_VALUES(type) do {                                               \
-                        const void* dat = nullptr;                                              \
-                        for (size_t i = 0; i < n_elements; ++i) {                               \
-                            if (tensor.type##_val_size() == 1) {                                \
-                                dat = reinterpret_cast<const void *>(&tensor.type##_val()[0]);  \
-                            } else {                                                            \
-                                dat = reinterpret_cast<const void *>(&tensor.type##_val()[i]);  \
-                            }                                                                   \
-                            values->setElement(i, dat);                                         \
-                        }                                                                       \
-                    } while(0)
+#define GET_VALUES(type)                                                                           \
+    do                                                                                             \
+    {                                                                                              \
+        const void* dat = nullptr;                                                                 \
+        for (size_t i = 0; i < n_elements; ++i)                                                    \
+        {                                                                                          \
+            if (tensor.type##_val_size() == 1)                                                     \
+            {                                                                                      \
+                dat = reinterpret_cast<const void*>(&tensor.type##_val()[0]);                      \
+            }                                                                                      \
+            else                                                                                   \
+            {                                                                                      \
+                dat = reinterpret_cast<const void*>(&tensor.type##_val()[i]);                      \
+            }                                                                                      \
+            values->setElement(i, dat);                                                            \
+        }                                                                                          \
+    } while (0)
 
                     values->resize(n_elements);
                     auto& tensor = node.attr().at("value").tensor();
                     size_t val_size;
-                    if (dt == tensorflow::DT_INT32) {
+                    if (dt == tensorflow::DT_INT32)
+                    {
                         GET_VALUES(int);
-                    } else if (dt == tensorflow::DT_INT64) {
+                    }
+                    else if (dt == tensorflow::DT_INT64)
+                    {
                         GET_VALUES(int64);
-                    } else if (dt == tensorflow::DT_BOOL) {
+                    }
+                    else if (dt == tensorflow::DT_BOOL)
+                    {
                         GET_VALUES(bool);
-                    } else if (dt == tensorflow::DT_HALF) {
+                    }
+                    else if (dt == tensorflow::DT_HALF)
+                    {
                         GET_VALUES(half);
-                    } else if (dt == tensorflow::DT_FLOAT) {
+                    }
+                    else if (dt == tensorflow::DT_FLOAT)
+                    {
                         GET_VALUES(float);
-                    } else if (dt == tensorflow::DT_DOUBLE) {
+                    }
+                    else if (dt == tensorflow::DT_DOUBLE)
+                    {
                         GET_VALUES(double);
-                    } else if (dt == tensorflow::DT_STRING) {
+                    }
+                    else if (dt == tensorflow::DT_STRING)
+                    {
                         values->resize(tensor.string_val()[0].length());
                         auto it = tensor.string_val()[0].begin();
                         for (size_t j = 0; it != tensor.string_val()[0].end(); ++j, ++it)
                         {
                             values->setElement(j, reinterpret_cast<const void*>(&it));
                         }
-                    } else {
+                    }
+                    else
+                    {
                         return false;
                     }
 
diff --git a/test/nnfusion/engine/profiler.cpp b/test/nnfusion/engine/profiler.cpp
index cffc74d1c..6709d1fdc 100644
--- a/test/nnfusion/engine/profiler.cpp
+++ b/test/nnfusion/engine/profiler.cpp
@@ -25,7 +25,8 @@ TEST(nnfusion_engine_profiler, basic_utils)
 
     // Filter out the kernels meeting the requirement;
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, element::f32);
+        KernelRegistry::Global()->FindKernelRegistrations(
+            gnode->get_op_type(), CUDA_GPU, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
 
     // Gnerate Test data
diff --git a/test/nnfusion/kernels/sample.cpp b/test/nnfusion/kernels/sample.cpp
index 367e6173c..e3cb109bf 100644
--- a/test/nnfusion/kernels/sample.cpp
+++ b/test/nnfusion/kernels/sample.cpp
@@ -24,7 +24,8 @@ TEST(nnfusion_core_kernels, sample)
 
     // Filter out the kernels meeting the requirement;
     std::vector<shared_ptr<const KernelRegistration>> kernel_regs =
-        KernelRegistry::Global()->FindKernelRegistrations(gnode->get_op_type(), CUDA_GPU, element::f32);
+        KernelRegistry::Global()->FindKernelRegistrations(
+            gnode->get_op_type(), CUDA_GPU, element::f32);
     shared_ptr<KernelContext> ctx(new KernelContext(gnode));
 
     EXPECT_GT(kernel_regs.size(), 0);

From 1ede97278beb38cfec60cf591522525ced9ea333 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 2 Dec 2020 13:46:02 +0800
Subject: [PATCH 09/32] meet master

---
 .../engine/pass/graph/kernel_selection.cpp    | 44 -------------------
 1 file changed, 44 deletions(-)

diff --git a/src/nnfusion/engine/pass/graph/kernel_selection.cpp b/src/nnfusion/engine/pass/graph/kernel_selection.cpp
index fa17819d5..cfe2992fd 100644
--- a/src/nnfusion/engine/pass/graph/kernel_selection.cpp
+++ b/src/nnfusion/engine/pass/graph/kernel_selection.cpp
@@ -419,49 +419,5 @@ bool FetchBasedSelector::run_on_graph(std::shared_ptr<nnfusion::graph::Graph>& g
         }
     }
 
-    return true;
-}
-
-bool DefaultKernelSelector::register_antares_kernel()
-{
-    for (auto pair : nnfusion::op::get_op_configs())
-    {
-        std::string op_name = pair.first;
-        std::vector<NNFusion_DeviceType> devs{CUDA_GPU, GENERIC_CPU, HLSL};
-
-        KernelRegistrar kernel_registrar_cuda(
-            op_name,
-            Name(op_name)
-                .Device(CUDA_GPU)
-                .TypeConstraint(element::f32)
-                .Tag("antares")
-                .Priority(9)
-                .KernelFactory([](shared_ptr<KernelContext> context) -> shared_ptr<KernelEmitter> {
-                    return make_shared<cuda::AntaresCudaKernelEmitter>(context);
-                })
-                .Build());
-        KernelRegistrar kernel_registrar_cpu(
-            op_name,
-            Name(op_name)
-                .Device(GENERIC_CPU)
-                .TypeConstraint(element::f32)
-                .Tag("antares")
-                .Priority(9)
-                .KernelFactory([](shared_ptr<KernelContext> context) -> shared_ptr<KernelEmitter> {
-                    return make_shared<cpu::AntaresCpuKernelEmitter>(context);
-                })
-                .Build());
-        KernelRegistrar kernel_registrar_hlsl(
-            op_name,
-            Name(op_name)
-                .Device(HLSL)
-                .TypeConstraint(element::f32)
-                .Tag("antares")
-                .Priority(9)
-                .KernelFactory([](shared_ptr<KernelContext> context) -> shared_ptr<KernelEmitter> {
-                    return make_shared<hlsl::AntaresHLSLKernelEmitter>(context);
-                })
-                .Build());
-    }
     return true;
 }
\ No newline at end of file

From a3d43f0740a1ec3c1f88f0de1930ea088ed9aa71 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Thu, 3 Dec 2020 19:48:53 +0800
Subject: [PATCH 10/32] fp16 runnable

---
 maint/script/build.sh                                      | 2 +-
 src/nnfusion/core/kernels/common_langunit.cpp              | 2 +-
 src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp       | 7 +++++++
 src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp       | 1 +
 .../operators/generic_op/generic_op_define/Convolution.cpp | 3 ++-
 .../generic_op/generic_op_define/DepthToSpace.cpp          | 2 +-
 .../generic_op/generic_op_define/DepthwiseConv2dNative.cpp | 3 ++-
 src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp     | 1 +
 src/nnfusion/engine/pass/graph/kernel_tuning.cpp           | 6 +++---
 .../frontend/tensorflow_import/util/graph_convert.cpp      | 3 +++
 .../ngraph/src/nnfusion/core/operators/op_define/fused.cpp | 2 +-
 11 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/maint/script/build.sh b/maint/script/build.sh
index 7d113d7c9..657ed780b 100755
--- a/maint/script/build.sh
+++ b/maint/script/build.sh
@@ -36,7 +36,7 @@ fi
 
 # Make
 pushd $THIS_SCRIPT_DIR/../../build/ > /dev/null
-make -j6
+make -j$(nproc)
 popd > /dev/null
 
 if [ $? -ne 0 ]; then
diff --git a/src/nnfusion/core/kernels/common_langunit.cpp b/src/nnfusion/core/kernels/common_langunit.cpp
index 0c8ed9bb0..cdf6ec226 100644
--- a/src/nnfusion/core/kernels/common_langunit.cpp
+++ b/src/nnfusion/core/kernels/common_langunit.cpp
@@ -24,7 +24,7 @@ LU_DEFINE(header::limits, "#include <limits>\n");
 
 // Macro
 LU_DEFINE(macro::NNFUSION_DEBUG, "#define NNFUSION_DEBUG\n");
-LU_DEFINE(macro::MIN, "#define MIN(a,b) ((a)>(b)?(b):(a))\n")
+LU_DEFINE(macro::MIN, "#define MIN(a,b) ((a)>(b)?(b):(a))\n");
 
 // Declaration
 LU_DEFINE(declaration::typedef_int,
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
index ac8b9e90a..18334faac 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
@@ -17,6 +17,13 @@ LU_DEFINE(header::cuda_prof_api, "#include <cuda_profiler_api.h>\n");
 LU_DEFINE(header::cuda_fp16, "#include <cuda_fp16.h>\n");
 
 // Macro
+LU_DEFINE(macro::HALF_MAX,
+          R"(#ifndef __HALF_COMPARE_EX__
+#define __HALF_COMPARE_EX__
+inline __device__ half max(half x, half y) { return x > y ? x : y; }
+inline __device__ half min(half x, half y) { return x < y ? x : y; }
+#endif)");
+
 LU_DEFINE(
     macro::CUDA_SAFE_CALL_NO_THROW,
     R"(#define CUDA_SAFE_CALL_NO_THROW(x)                                                                 \
diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
index 93dbc0243..ae68e3e72 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.hpp
@@ -21,6 +21,7 @@ namespace nnfusion
 
         namespace macro
         {
+            LU_DECLARE(HALF_MAX);
             LU_DECLARE(CUDA_SAFE_CALL_NO_THROW);
             LU_DECLARE(CUDA_SAFE_CALL);
             LU_DECLARE(CUDNN_SAFE_CALL_NO_THROW);
diff --git a/src/nnfusion/core/operators/generic_op/generic_op_define/Convolution.cpp b/src/nnfusion/core/operators/generic_op/generic_op_define/Convolution.cpp
index 34c7f1f71..cd37a1696 100644
--- a/src/nnfusion/core/operators/generic_op/generic_op_define/Convolution.cpp
+++ b/src/nnfusion/core/operators/generic_op/generic_op_define/Convolution.cpp
@@ -57,7 +57,8 @@ REGISTER_OP(Convolution)
         {
             auto pad_template =
                 ".when([-@pad_0@ + HO + KH >= 0, -@pad_0@ + HO + KH < @height@, -@pad_1@ + WO + KW "
-                ">= 0, -@pad_1@ + WO + KW < @width@], 0.0)";
+                ">= 0, -@pad_1@ + WO + KW < @width@], "
+                "const(0.0).cast(@input0@@input0_layout@.dtype()))";
             pad_cond = op::create_code_from_template(pad_template, config);
         }
         config["pad_cond"] = pad_cond;
diff --git a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthToSpace.cpp b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthToSpace.cpp
index 0de6b0d83..727ffe888 100644
--- a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthToSpace.cpp
+++ b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthToSpace.cpp
@@ -48,7 +48,7 @@ REGISTER_OP(DepthToSpace)
     })
     .translate_v2([](std::shared_ptr<graph::GNode> curr) -> std::string {
         auto expression_template =
-            R"( temp0@mediate0_layout@ = @input0@@input0_layout@ @cond0@; temp1@mediate1_layout@ = temp0@mediate0_layout@; @output0@@output0_layout@ = temp1@mediate1o_layout@ @cond1@;  ## @: plan/advance_fusion )";
+            R"( temp0@mediate0_layout@ = @input0@@input0_layout@ @cond0@; temp1@mediate1_layout@ = temp0@mediate0_layout@; @output0@@output0_layout@ = temp1@mediate1o_layout@ @cond1@;)";
 
         auto input_shape = curr->get_input_shape(0);
         auto _op = std::dynamic_pointer_cast<nnfusion::op::GenericOp>(curr->get_op_ptr());
diff --git a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp
index 0d680d385..45690effc 100644
--- a/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp
+++ b/src/nnfusion/core/operators/generic_op/generic_op_define/DepthwiseConv2dNative.cpp
@@ -103,7 +103,8 @@ REGISTER_OP(DepthwiseConv2dNative)
         {
             auto pad_template =
                 ".when([-@pad_0@ + HO + KH >= 0, -@pad_0@ + HO + KH < @height@, -@pad_1@ + WO + KW "
-                ">= 0, -@pad_1@ + WO + KW < @width@], 0.0)";
+                ">= 0, -@pad_1@ + WO + KW < @width@], "
+                "const(0.0).cast(@input0@@input0_layout@.dtype()))";
             pad_cond = op::create_code_from_template(pad_template, config);
         }
         config["pad_cond"] = pad_cond;
diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index 9dbc3fc84..ce2397562 100644
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -161,6 +161,7 @@ void CudaCodegenPass::initialize(std::shared_ptr<InterpreterContext> ctx,
     projgen->lup_codegen->require(macro::CUDA_SAFE_CALL);
     projgen->lup_codegen->require(macro::CUDNN_SAFE_CALL);
     projgen->lup_codegen->require(macro::CUBLAS_SAFE_CALL);
+    projgen->lup_codegen->require(macro::HALF_MAX);
 
     return;
 }
diff --git a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
index 00d3456ca..4a083ccc0 100644
--- a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
+++ b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
@@ -187,7 +187,7 @@ bool KernelTuning::register_antares_kernel()
             op_name,
             kernels::Name(op_name)
                 .Device(CUDA_GPU)
-                .TypeConstraint(DT_FLOAT)
+                .TypeConstraint(element::f32)
                 .Tag("antares")
                 .Priority(9)
                 .KernelFactory([](shared_ptr<kernels::KernelContext> context)
@@ -199,7 +199,7 @@ bool KernelTuning::register_antares_kernel()
             op_name,
             kernels::Name(op_name)
                 .Device(GENERIC_CPU)
-                .TypeConstraint(DT_FLOAT)
+                .TypeConstraint(element::f32)
                 .Tag("antares")
                 .Priority(9)
                 .KernelFactory([](shared_ptr<kernels::KernelContext> context)
@@ -211,7 +211,7 @@ bool KernelTuning::register_antares_kernel()
             op_name,
             kernels::Name(op_name)
                 .Device(HLSL)
-                .TypeConstraint(DT_FLOAT)
+                .TypeConstraint(element::f32)
                 .Tag("antares")
                 .Priority(9)
                 .KernelFactory([](shared_ptr<kernels::KernelContext> context)
diff --git a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp
index 257aba94c..17d3a12ee 100644
--- a/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp
+++ b/src/nnfusion/frontend/tensorflow_import/util/graph_convert.cpp
@@ -59,6 +59,9 @@ namespace nnfusion
                             config[entry.first] = "float32";
                             break;
                         case ::tensorflow::DataType::DT_INT32: config[entry.first] = "int32"; break;
+                        case ::tensorflow::DataType::DT_HALF:
+                            config[entry.first] = "float16";
+                            break;
                         default: NNFUSION_CHECK(false) << "Unrecognized data type: " << dtype;
                         }
                     }
diff --git a/thirdparty/ngraph/src/nnfusion/core/operators/op_define/fused.cpp b/thirdparty/ngraph/src/nnfusion/core/operators/op_define/fused.cpp
index a4425f60f..b0962174b 100644
--- a/thirdparty/ngraph/src/nnfusion/core/operators/op_define/fused.cpp
+++ b/thirdparty/ngraph/src/nnfusion/core/operators/op_define/fused.cpp
@@ -175,5 +175,5 @@ void Fused::register_ir2(std::vector<std::shared_ptr<graph::GNode>>& gnodes)
     NNFUSION_LOG(INFO) << fused_op_ir2;
 
     // plan_rule = "## @: " + plan_rule;
-    plan_rule = "## @: plan/advance_fusion";
+    plan_rule = "";
 }

From 7d1216008515881b2f311b9f2b646b7fbbbb4313 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 4 Dec 2020 14:19:15 +0800
Subject: [PATCH 11/32] fix macro newline

---
 src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
index 18334faac..ea429d047 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_langunit.cpp
@@ -22,7 +22,8 @@ LU_DEFINE(macro::HALF_MAX,
 #define __HALF_COMPARE_EX__
 inline __device__ half max(half x, half y) { return x > y ? x : y; }
 inline __device__ half min(half x, half y) { return x < y ? x : y; }
-#endif)");
+#endif
+)");
 
 LU_DEFINE(
     macro::CUDA_SAFE_CALL_NO_THROW,

From f9209967feb535bc1628c403bf8ae3eddd1b1464 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 4 Dec 2020 15:11:15 +0800
Subject: [PATCH 12/32] check device type

---
 src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index ce2397562..3875f582b 100644
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -805,7 +805,8 @@ void CudaCodegenPass::create_header_file(std::shared_ptr<InterpreterContext> ctx
     if (device_type() == CUDA_GPU || device_type() == ROCM_GPU)
         lu_header << header::cuda->get_code();
     // TODO only include this if half is used
-    lu_header << header::cuda_fp16->get_code();
+    if (device_type() == CUDA_GPU) 
+        lu_header << header::cuda_fp16->get_code();
 
     lu_header << "extern \"C\" int kernel_entry(";
     std::string params = get_kernel_entry_paras(tu);

From b71e6837de14c58406c626e08657fd9207074d3d Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Mon, 7 Dec 2020 16:39:17 +0800
Subject: [PATCH 13/32] code style

---
 src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index 3875f582b..25f5586a1 100644
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -805,7 +805,7 @@ void CudaCodegenPass::create_header_file(std::shared_ptr<InterpreterContext> ctx
     if (device_type() == CUDA_GPU || device_type() == ROCM_GPU)
         lu_header << header::cuda->get_code();
     // TODO only include this if half is used
-    if (device_type() == CUDA_GPU) 
+    if (device_type() == CUDA_GPU)
         lu_header << header::cuda_fp16->get_code();
 
     lu_header << "extern \"C\" int kernel_entry(";

From 76fe4fd76aaacb2541dffdb301b6f50cef072ca4 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Tue, 8 Dec 2020 15:26:34 +0800
Subject: [PATCH 14/32] fix ROCm unsupported LU's

---
 src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp     | 2 +-
 src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 7a9bef553..acd1939b6 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -380,7 +380,7 @@ LanguageUnit_p cuda::Dot::emit_dependency()
     _lu->require(header::sstream);
     _lu->require(macro::CUBLAS_SAFE_CALL);
     _lu->require(macro::CUDA_SAFE_CALL);
-    _lu->require(declaration::cuda_fp16_scale);
+    // _lu->require(declaration::cuda_fp16_scale);
     //_lu->require(declaration::cublas_handle);
     return _lu;
 }
diff --git a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
index 25f5586a1..dc535d584 100644
--- a/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
+++ b/src/nnfusion/engine/pass/codegen/cuda_codegen_pass.cpp
@@ -842,7 +842,7 @@ void CudaCodegenPass::create_main_file(std::shared_ptr<InterpreterContext> ctx,
     re_main->require(header::limits);
 
     re_main->require(header::cuda_prof_api);
-    re_main->require(header::cuda_fp16);
+    // re_main->require(header::cuda_fp16);
     re_main->require(macro::CUDA_SAFE_CALL);
 
     lu_main << "#include \"nnfusion_rt.h\"\n";

From a7bf41e4c6bb00bd2fe43e35d0347c14b4ffcf58 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 11 Dec 2020 14:02:15 +0800
Subject: [PATCH 15/32] fix DataBuffer

---
 .../ngraph/src/nnfusion/core/operators/op_define/constant.hpp  | 3 ++-
 .../src/nnfusion/frontend/onnx_import/util/graph_convert.cpp   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/thirdparty/ngraph/src/nnfusion/core/operators/op_define/constant.hpp b/thirdparty/ngraph/src/nnfusion/core/operators/op_define/constant.hpp
index b6c2b0ee5..055be047a 100644
--- a/thirdparty/ngraph/src/nnfusion/core/operators/op_define/constant.hpp
+++ b/thirdparty/ngraph/src/nnfusion/core/operators/op_define/constant.hpp
@@ -109,8 +109,9 @@ namespace nnfusion
                     << nnfusion::shape_size(m_shape) << ").";
 
                 DataBuffer buf(element_type);
+                size_t shape_size = nnfusion::shape_size(m_shape);
 
-                buf.loadFromStrings(values);
+                buf.loadFromStrings(values, shape_size);
 
                 buf.dump(m_data);
             }
diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp
index f6022d220..7dbb5a2c4 100644
--- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp
+++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp
@@ -149,7 +149,7 @@ namespace nnfusion
                     onnx::ModelProto proto_without_init;
                     proto_without_init.CopyFrom(model_proto);
                     proto_without_init.mutable_graph()->mutable_initializer()->Clear();
-                    NNFUSION_LOG(INFO) << proto_without_init.DebugString();
+                    // NNFUSION_LOG(INFO) << proto_without_init.DebugString();
                 }
 
                 std::string

From 1e42180f75a4ba979ef4b4878db1f1afe470dce6 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 11 Dec 2020 17:10:16 +0800
Subject: [PATCH 16/32] onnx changed to DataBuffer style import

---
 .../frontend/onnx_import/core/tensor.hpp      |  50 ++----
 .../frontend/onnx_import/util/util.cpp        | 144 ++++++++++++++----
 .../frontend/onnx_import/util/util.hpp        |   5 +-
 3 files changed, 133 insertions(+), 66 deletions(-)

diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/core/tensor.hpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/core/tensor.hpp
index 415abc080..85a624b14 100644
--- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/core/tensor.hpp
+++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/core/tensor.hpp
@@ -22,6 +22,7 @@
 #pragma once
 
 #include "../util/util.hpp"
+#include "ngraph/src/nnfusion/common/type/data_buffer.hpp"
 
 namespace nnfusion
 {
@@ -55,50 +56,38 @@ namespace nnfusion
                     return detail::get_data<T>(*m_tensor_proto);
                 }
 
+                // Returns the tensor payload as a DataBuffer by delegating to
+                // detail::buffer_get_data on the wrapped TensorProto.
+                DataBuffer buffer_get_data() const
+                {
+                    return detail::buffer_get_data(*m_tensor_proto);
+                }
+
                 const std::string& get_name() const
                 {
                     NNFUSION_CHECK(m_tensor_proto->has_name()) << "tensor has no name specified.";
                     return m_tensor_proto->name();
                 }
 
-                const element::Type& get_ng_type() const
+                // Returns the nnfusion element type matching the proto's data_type.
+                // The result is returned by value because it is a local that
+                // ONNXDataTypeToNNFusionElementType fills in.
+                element::Type get_ng_type() const
                 {
                     NNFUSION_CHECK(m_tensor_proto->has_data_type())
                         << "tensor has no data type specified.";
 
-                    switch (m_tensor_proto->data_type())
-                    {
-                    case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
-                        return element::boolean;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
-                    case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
-                        return element::f32;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
-                        return element::f64;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_INT8: return element::i8;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
-                        return element::i16;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
-                        return element::i32;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
-                        return element::i64;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: return element::u8;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
-                        return element::u16;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
-                        return element::u32;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
-                        return element::u64;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UNDEFINED:
-                        NNFUSION_CHECK_FAIL() << "data type is not defined";
-                        break;
-                    default:
-                        NNFUSION_CHECK_FAIL()
-                            << "unsupported data type: "
-                            << onnx::TensorProto_DataType_Name(
-                                   onnx::TensorProto_DataType(m_tensor_proto->data_type()));
-                        break;
-                    }
+                    element::Type element_type;
+                    const auto onnx_dt =
+                        static_cast<onnx::TensorProto_DataType>(m_tensor_proto->data_type());
+                    const bool status = ONNXDataTypeToNNFusionElementType(onnx_dt, &element_type);
+                    // Print the symbolic name as the removed switch did; the raw id is kept
+                    // next to it in case the name lookup yields nothing useful.
+                    NNFUSION_CHECK(status) << "Data type not supported: "
+                                           << onnx::TensorProto_DataType_Name(onnx_dt) << " ("
+                                           << m_tensor_proto->data_type() << ")";
+
+                    return element_type;
                 }
 
                 operator onnx::TensorProto_DataType() const
diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.cpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.cpp
index d6f52653d..28a56620f 100644
--- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.cpp
+++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.cpp
@@ -28,7 +28,7 @@ namespace nnfusion
     {
         namespace onnx_import
         {
-            bool ONNXDataTypeToNNFusionElementType(const onnx::TensorProto_DataType onnx_dt,
+            bool ONNXDataTypeToNNFusionElementType(onnx::TensorProto_DataType onnx_dt,
                                                    nnfusion::element::Type* nnfusion_et)
             {
                 switch (onnx_dt)
@@ -36,8 +36,10 @@ namespace nnfusion
                 case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
                     *nnfusion_et = element::boolean;
                     break;
-                case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
                 case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
+                    *nnfusion_et = element::f16;
+                    break;
+                case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
                     *nnfusion_et = element::f32;
                     break;
                 case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
@@ -86,35 +88,38 @@ namespace nnfusion
                                                            const Shape shape,
                                                            const Tensor& tensor)
             {
-                switch (onnx_et)
-                {
-                case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
-                    return make_constant_op<bool>(element::boolean, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
-                case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
-                    return make_constant_op<float>(element::f32, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
-                    return make_constant_op<double>(element::f64, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_INT8:
-                    return make_constant_op<int8_t>(element::i8, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
-                    return make_constant_op<int16_t>(element::i16, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
-                    return make_constant_op<int32_t>(element::i32, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
-                    return make_constant_op<int64_t>(element::i64, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_UINT8:
-                    return make_constant_op<uint8_t>(element::u8, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
-                    return make_constant_op<uint16_t>(element::u16, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
-                    return make_constant_op<uint32_t>(element::u32, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
-                    return make_constant_op<uint64_t>(element::u64, shape, tensor);
-                default:
-                    NNFUSION_CHECK_FAIL() << "unsupported value info element type: "
-                                          << onnx::TensorProto_DataType_Name(onnx_et);
-                }
+                element::Type element_type = tensor.get_ng_type();
+                return std::make_shared<op::Constant>(
+                    element_type, shape, tensor.buffer_get_data());
+                // switch (onnx_et)
+                // {
+                // case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
+                //     return make_constant_op<bool>(element::boolean, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
+                // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
+                //     return make_constant_op<float>(element::f32, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
+                //     return make_constant_op<double>(element::f64, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_INT8:
+                //     return make_constant_op<int8_t>(element::i8, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
+                //     return make_constant_op<int16_t>(element::i16, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
+                //     return make_constant_op<int32_t>(element::i32, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
+                //     return make_constant_op<int64_t>(element::i64, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT8:
+                //     return make_constant_op<uint8_t>(element::u8, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
+                //     return make_constant_op<uint16_t>(element::u16, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
+                //     return make_constant_op<uint32_t>(element::u32, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
+                //     return make_constant_op<uint64_t>(element::u64, shape, tensor);
+                // default:
+                //     NNFUSION_CHECK_FAIL() << "unsupported value info element type: "
+                //                           << onnx::TensorProto_DataType_Name(onnx_et);
+                // }
             }
 
             std::shared_ptr<graph::GNode> GetInputNode(const NodeMap& all_ng_nodes,
@@ -280,6 +285,117 @@ namespace nnfusion
                     name, std::vector<std::size_t>(kernel_shape.size(), 1UL));
             }
 
+            // Copies the payload of an ONNX TensorProto into a DataBuffer whose element
+            // type is derived from the proto's data_type.
+            //
+            // raw_data, when present, is loaded as-is after checking that it holds at least
+            // as many bytes as the shape requires. Otherwise the values are read from the
+            // typed repeated fields; per the ONNX TensorProto definition, int32_data also
+            // carries bool/int8/uint8/int16/uint16 values and the bit patterns of float16
+            // values, and uint64_data also carries uint32 values.
+            //
+            // Unsupported types, negative dimensions and payloads whose size disagrees with
+            // the shape fail an NNFUSION_CHECK.
+            DataBuffer detail::buffer_get_data(const onnx::TensorProto& tensor)
+            {
+                size_t n_element = 1;
+                element::Type type;
+                const auto onnx_dt = static_cast<onnx::TensorProto_DataType>(tensor.data_type());
+                const bool status = ONNXDataTypeToNNFusionElementType(onnx_dt, &type);
+
+                NNFUSION_CHECK(status) << "Unsupported ONNX data_type " << tensor.data_type()
+                                       << " is found";
+
+                DataBuffer buf(type);
+
+                for (auto dim : tensor.dims())
+                {
+                    // A negative extent would wrap around when converted to size_t.
+                    NNFUSION_CHECK(dim >= 0) << "Tensor has a negative dimension: " << dim;
+                    n_element *= static_cast<size_t>(dim);
+                }
+                buf.resize(n_element);
+
+                if (tensor.has_raw_data())
+                {
+                    const std::string& raw = tensor.raw_data();
+                    // Guard the copy below against reading past the end of raw_data.
+                    NNFUSION_CHECK(raw.size() >= n_element * type.size())
+                        << "Tensor raw_data is smaller than its shape requires. (" << raw.size()
+                        << " < " << n_element * type.size() << ")";
+                    buf.load(raw.data(), n_element);
+                }
+                else
+                {
+                    // GET_VALUE converts every entry of the pb_type##_data field to mid_type
+                    // and hands the address of that temporary to DataBuffer::setElement.
+#define GET_VALUE(pb_type, mid_type)                                                               \
+    do                                                                                             \
+    {                                                                                              \
+        const void* dat;                                                                           \
+        mid_type m;                                                                                \
+        NNFUSION_CHECK(n_element == tensor.pb_type##_data_size())                                  \
+            << "Tensor shape is not the same with tensor data_size. (" << n_element                \
+            << " != " << tensor.pb_type##_data_size() << ")";                                      \
+        for (size_t i = 0; i < n_element; ++i)                                                     \
+        {                                                                                          \
+            m = static_cast<mid_type>(tensor.pb_type##_data()[i]);                                 \
+            dat = reinterpret_cast<const void*>(&m);                                               \
+            buf.setElement(i, dat);                                                                \
+        }                                                                                          \
+    } while (0)
+
+                    switch (onnx_dt)
+                    {
+                    case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
+                        // int32_data holds the half-precision bit pattern, not a numeric
+                        // value, so keep the low 16 bits untouched. Assumes setElement copies
+                        // the raw bytes of one element -- TODO confirm against DataBuffer.
+                        GET_VALUE(int32, uint16_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
+                        GET_VALUE(float, float);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
+                        GET_VALUE(double, double);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
+                        // Assumes element::boolean is stored as one byte -- TODO confirm.
+                        GET_VALUE(int32, char);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_INT8:
+                        GET_VALUE(int32, int8_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
+                        GET_VALUE(int32, int16_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
+                        GET_VALUE(int32, int32_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
+                        GET_VALUE(int64, int64_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT8:
+                        GET_VALUE(int32, uint8_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
+                        GET_VALUE(int32, uint16_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
+                        GET_VALUE(uint64, uint32_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
+                        GET_VALUE(uint64, uint64_t);
+                        break;
+                    default:
+                        NNFUSION_CHECK_FAIL() << "unsupported onnx element type: "
+                                              << onnx::TensorProto_DataType_Name(onnx_dt);
+                    }
+#undef GET_VALUE
+                }
+                return buf;
+            }
+
         } // namespace onnx_import
     }     // namespace frontend
 } // namespace nnfusion
diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.hpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.hpp
index 871ab4801..31bf27726 100644
--- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.hpp
+++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/util/util.hpp
@@ -28,6 +28,7 @@
 #include <vector>
 
 #include "../onnx_base.hpp"
+#include "ngraph/src/nnfusion/common/type/data_buffer.hpp"
 #include "nnfusion/common/common.hpp"
 
 namespace nnfusion
@@ -51,6 +52,8 @@ namespace nnfusion
                     return {it, it + (raw_data.size() / sizeof(T))};
                 }
 
+                DataBuffer buffer_get_data(const onnx::TensorProto& tensor);
+
                 template <typename T>
                 inline std::vector<T> get_data(const onnx::TensorProto& tensor)
                 {
@@ -186,7 +189,7 @@ namespace nnfusion
             class Tensor;
             class Node;
 
-            bool ONNXDataTypeToNNFusionElementType(const onnx::TensorProto_DataType onnx_dt,
+            bool ONNXDataTypeToNNFusionElementType(onnx::TensorProto_DataType onnx_dt,
                                                    nnfusion::element::Type* nnfusion_et);
 
             template <typename T>

From 18ab500f7be9333757128e57ae8e446e4809e8fc Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Thu, 17 Dec 2020 11:22:50 +0800
Subject: [PATCH 17/32] fix onnx fp16

---
 src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp      | 4 ++--
 .../src/nnfusion/frontend/onnx_import/op/constant.hpp      | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp b/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp
index 76d40424d..42c782ae4 100644
--- a/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp
+++ b/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp
@@ -25,6 +25,6 @@ namespace nnfusion
             public:
                 bool run_on_graph(std::shared_ptr<nnfusion::graph::Graph>& graph) override;
             };
-        } // namespace pass
-    }     // namespace graph
+        } // namespace graph
+    }     // namespace pass
 } // namespace nnfusion
diff --git a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/op/constant.hpp b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/op/constant.hpp
index f3dd0bfc9..4280f9f86 100644
--- a/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/op/constant.hpp
+++ b/thirdparty/ngraph/src/nnfusion/frontend/onnx_import/op/constant.hpp
@@ -65,8 +65,11 @@ namespace nnfusion
                     Node node(node_proto);
                     auto tensor = node.get_attribute_value<Tensor>("value");
 
-                    const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type());
-                    auto op = func_param(tensor.get_ng_type(), tensor);
+                    // const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type());
+                    // auto op = func_param(tensor.get_ng_type(), tensor);
+                    auto op = std::make_shared<op::Constant>(
+                        tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data()
+                    );
 
                     op->set_name(node_proto.output(0));
                     auto gnode = m_graph->add_node_and_edge(op, graph::GNodeVector({}));

From 9dd7529da14c999bc047305d87b8b9b42312eb87 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Tue, 29 Dec 2020 13:34:29 +0800
Subject: [PATCH 18/32] bert l1 runnable

---
 .../core/kernels/cuda_gpu/cuda_helper.cpp     |   5 +
 .../kernels/cuda_gpu/kernels/batch_matmul.cpp |  18 +-
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 190 +++++++++---------
 .../core/kernels/cuda_gpu/kernels/reduce.hpp  |   2 +-
 .../core/kernels/cuda_gpu/kernels/softmax.cpp |   7 +-
 .../engine/pass/graph/kernel_tuning.cpp       |   6 +
 .../frontend/onnx_import/core/tensor.hpp      |   2 +-
 .../frontend/onnx_import/op/constant.hpp      |   3 +-
 .../frontend/onnx_import/ops_bridge.cpp       |   1 +
 .../frontend/onnx_import/util/util.hpp        |   2 +-
 10 files changed, 133 insertions(+), 103 deletions(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp
index db9856623..c48c0e41d 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp
@@ -33,6 +33,11 @@ LanguageUnit_p cuda::get_math_kernel(const std::string& name,
         writer << ")\n";
         writer << "{\n";
         writer.indent++;
+        if (name == "convert" && data_types[num_inputs] == "half" && data_types[0] == "int64_t")
+        {
+            writer << "return (long long)" + math_kernel << ";\n";
+        }
+        else
         {
             writer << "return " + math_kernel << ";\n";
         }
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
index 173e95e93..c42d7780c 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
@@ -8,6 +8,7 @@
 //   [a] ./new_kernel_0.cpp
 //   [b] ../../../ops/op_define/new_op_0.cpp
 
+#include <string>
 #include "../cuda_emitter.hpp"
 #include "../cuda_langunit.hpp"
 #include "nnfusion/core/operators/generic_op/generic_op.hpp"
@@ -52,6 +53,15 @@ namespace nnfusion
                     const nnfusion::Shape& input_shape_0 = m_context->inputs[0]->get_shape();
                     const nnfusion::Shape& input_shape_1 = m_context->inputs[1]->get_shape();
 
+                    element::Type dtype0 = m_context->inputs[0]->get_element_type();
+                    element::Type dtype1 = m_context->inputs[1]->get_element_type();
+                    element::Type dtype2 = m_context->outputs[0]->get_element_type();
+                    NNFUSION_CHECK(dtype0 == dtype1 && dtype1 == dtype2)
+                        << "Unsupported element type combination of (" << dtype0.c_type_string()
+                        << ", " << dtype1.c_type_string() << ") -> " << dtype2.c_type_string()
+                        << ".";
+                    element::Type& dtype = dtype0;
+
                     bool transA = generic_op->localOpConfig.getRoot()["adj_x"]["b"];
                     bool transB = generic_op->localOpConfig.getRoot()["adj_y"]["b"];
                     size_t A1 = 1LU;
@@ -92,10 +102,11 @@ namespace nnfusion
                         stride_b = A2 * A3, ldc = A4, stride_c = A2 * A4;
                     }
 
+                    std::string type = dtype.c_type_string();
                     float alpha = 1.0f, beta = 0.0f;
                     auto code = nnfusion::op::create_code_from_template(
                         R"(
-                        static const float alpha = @alpha@F, beta = @beta@F;
+                        static const @dtype@ alpha = @alpha@, beta = @beta@;
                         // if (!@hCublas@)
                         //     CUBLAS_SAFE_CALL(@api_create@(&@hCublas@));
                         CUBLAS_SAFE_CALL(@api_exec@(
@@ -106,7 +117,9 @@ namespace nnfusion
                         {
                             {"hCublas", "cublas_handle"},
                             {"api_create", "cublasCreate"},
-                            {"api_exec", "cublasSgemmStridedBatched"},
+                            {"api_exec",
+                             dtype == element::f32 ? "cublasSgemmStridedBatched"
+                                                   : "cublasHgemmStridedBatched"},
                             {"transA", transB ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
                             {"transB", transA ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
                             {"alpha", alpha},
@@ -121,6 +134,7 @@ namespace nnfusion
                             {"stride_b", stride_b},
                             {"stride_c", stride_c},
                             {"batch", A1},
+                            {"dtype", type},
                         });
 
                     LanguageUnit_p _lu(new LanguageUnit(get_function_name()));
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index acd1939b6..24178ecb2 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -86,7 +86,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
         // matrix * vector
         else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
         {
-            lu << "const float alpha = 1.0;\n const float beta = 0;\n";
+            lu << "const float alpha = 1.0;\n const float beta = 0.;\n";
             lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
             if (trans_A)
                 lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
@@ -107,7 +107,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
             int n = trans_A ? arg0_shape[1] : arg0_shape[0];
             int k = trans_A ? arg0_shape[0] : arg0_shape[1];
 
-            lu << "const float alpha = 1.0;\nconst float beta = 0;\n";
+            lu << "const float alpha = 1.0;\nconst float beta = 0.;\n";
 
             lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle,"
                << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
@@ -186,7 +186,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                 }
             }
 
-            lu << "const float alpha = 1.0;\nconst float beta = 0;\n";
+            lu << "const float alpha = 1.0;\nconst float beta = 0.;\n";
 
             lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle,"
                << " CUBLAS_OP_N,"
@@ -261,111 +261,113 @@ LanguageUnit_p cuda::Dot::emit_function_body()
         //     << " static_cast<float*>(output0),"
         //     << " 1));\n";
         // }
-        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
-        //         (trans_A || trans_B))
-        // {
-        //     int m = trans_B ? arg1_shape[0] : arg1_shape[1];
-        //     int n = trans_A ? arg0_shape[1] : arg0_shape[0];
-        //     int k = trans_A ? arg0_shape[0] : arg0_shape[1];
-
-        //     lu << "const half alpha = 1.0;\nconst half beta = 0;\n";
-
-        //     lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-        //     << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
-        //     << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
-        //     << " " << n << ","
-        //     << " " << k << ","
-        //     << " &alpha,"
-        //     << " static_cast<const half*>(input1),"
-        //     << " " << arg1_shape[1] << ","
-        //     << " static_cast<const half*>(input0),"
-        //     << " " << arg0_shape[1] << ","
-        //     << " &beta,"
-        //     << " static_cast<half*>(output0),"
-        //     << " " << m << "));\n";
-        // } else {
-        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
-        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
-        size_t axes_for_k_count = reduction_axes;
-        size_t m = 1;
-        size_t n = 1;
-        size_t k = 1;
-
-        // check if input and output size correct
-        // check and calculate k for arg0 and arg1
-        size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
-        size_t arg1_k_idx = 0;                // first axe in arg1 for k
-
-        for (size_t i = 0; i < axes_for_k_count; i++)
+        if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
+            (trans_A || trans_B))
         {
-            k *= arg0_shape[arg0_k_idx];
-            if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
-            {
-                std::vector<std::string> arg_vec{"arg0", "arg1"};
-                std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+            int m = trans_B ? arg1_shape[0] : arg1_shape[1];
+            int n = trans_A ? arg0_shape[1] : arg0_shape[0];
+            int k = trans_A ? arg0_shape[0] : arg0_shape[1];
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                      << nnfusion::join(shape_vec) << " respectively, at Node "
-                                      << m_context->gnode->get_name()
-                                      << ", do not match for dot op";
-            }
+            lu << "const half alpha = 1.0;\nconst half beta = 0.;\n";
+
+            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+               << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
+               << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
+               << " " << n << ","
+               << " " << k << ","
+               << " &alpha,"
+               << " static_cast<const half*>(input1),"
+               << " " << arg1_shape[1] << ","
+               << " static_cast<const half*>(input0),"
+               << " " << arg0_shape[1] << ","
+               << " &beta,"
+               << " static_cast<half*>(output0),"
+               << " " << m << "));\n";
         }
-        // check and calculate m for arg0 and out
-        size_t arg0_m_idx = 0; // first axe in arg0 for m
-        size_t out_m_idx = 0;  // first axe in out for m
-        for (size_t i = 0; i < axes_for_m_count; i++)
+        else
         {
-            m *= arg0_shape[arg0_m_idx];
-            if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+            size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
+            size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
+            size_t axes_for_k_count = reduction_axes;
+            size_t m = 1;
+            size_t n = 1;
+            size_t k = 1;
+
+            // check if input and output size correct
+            // check and calculate k for arg0 and arg1
+            size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
+            size_t arg1_k_idx = 0;                // first axe in arg1 for k
+
+            for (size_t i = 0; i < axes_for_k_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg0", "output"};
-                std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
+                k *= arg0_shape[arg0_k_idx];
+                if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg0", "arg1"};
+                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                      << nnfusion::join(shape_vec) << " respectively, at Node "
-                                      << m_context->gnode->get_name()
-                                      << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                          << nnfusion::join(shape_vec) << " respectively, at Node "
+                                          << m_context->gnode->get_name()
+                                          << ", do not match for dot op";
+                }
             }
-        }
-        // check and calculate n for arg1 and out
-        size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
-        size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
-        for (size_t i = 0; i < axes_for_n_count; i++)
-        {
-            n *= arg1_shape[arg1_n_idx];
-            if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+            // check and calculate m for arg0 and out
+            size_t arg0_m_idx = 0; // first axe in arg0 for m
+            size_t out_m_idx = 0;  // first axe in out for m
+            for (size_t i = 0; i < axes_for_m_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg1", "output"};
-                std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
+                m *= arg0_shape[arg0_m_idx];
+                if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg0", "output"};
+                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                      << nnfusion::join(shape_vec) << " respectively, at Node "
-                                      << m_context->gnode->get_name()
-                                      << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                          << nnfusion::join(shape_vec) << " respectively, at Node "
+                                          << m_context->gnode->get_name()
+                                          << ", do not match for dot op";
+                }
             }
-        }
+            // check and calculate n for arg1 and out
+            size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
+            size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
+            for (size_t i = 0; i < axes_for_n_count; i++)
+            {
+                n *= arg1_shape[arg1_n_idx];
+                if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg1", "output"};
+                    std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
 
-        lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
-
-        lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-           << " CUBLAS_OP_N,"
-           << " CUBLAS_OP_N,"
-           << " " << n << ","
-           << " " << m << ","
-           << " " << k << ","
-           << " &alpha,"
-           << " static_cast<const half*>(input1),"
-           << " " << n << ","
-           << " static_cast<const half*>(input0),"
-           << " " << k << ","
-           << " &beta,"
-           << " static_cast<half*>(output0),"
-           << " " << n << "));\n";
-        // }
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                          << nnfusion::join(shape_vec) << " respectively, at Node "
+                                          << m_context->gnode->get_name()
+                                          << ", do not match for dot op";
+                }
+            }
+
+            lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
+
+            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+               << " CUBLAS_OP_N,"
+               << " CUBLAS_OP_N,"
+               << " " << n << ","
+               << " " << m << ","
+               << " " << k << ","
+               << " &alpha,"
+               << " static_cast<const half*>(input1),"
+               << " " << n << ","
+               << " static_cast<const half*>(input0),"
+               << " " << k << ","
+               << " &beta,"
+               << " static_cast<half*>(output0),"
+               << " " << n << "));\n";
+        }
     }
     else
     {
-        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot.";
+        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for kernel dot.";
     }
     //lu.block_end();
     return _lu;
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp
index 97353e5e8..c9bfb3c26 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp
@@ -189,7 +189,7 @@ int data_idx_offset = block_idx * width;
 float val = 0.0;
 for (int tidx = thread_idx; tidx < width; tidx += block_size) {
     int data_idx = tidx + data_idx_offset;
-    val += input0[data_idx];
+    val += static_cast<float>(input0[data_idx]);
 }
 val = reduceSum(val, thread_idx, block_size, shm);
 if (thread_idx == 0) output0[block_idx] = val;
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
index c653abedc..78e7af632 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "softmax.hpp"
+#include "../cuda_cudnn.hpp"
 #include "nnfusion/core/operators/generic_op/generic_op.hpp"
 
 using namespace nnfusion;
@@ -22,7 +23,8 @@ LanguageUnit_p
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
-    string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type);
+    element::Type type = m_context->inputs[0]->get_element_type();
+    string data_type = cuda::get_cudnn_datatype(type);
     string tensor_format = "CUDNN_TENSOR_NCHW";
     lu << "cudnnTensorDescriptor_t " << desc << ";\n";
     lu << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << desc << "));\n";
@@ -154,7 +156,8 @@ LanguageUnit_p
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
-    string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type);
+    element::Type type = m_context->inputs[0]->get_element_type();
+    string data_type = cuda::get_cudnn_datatype(type);
     string tensor_format = "CUDNN_TENSOR_NCHW";
     lu << "cudnnTensorDescriptor_t " << desc << ";\n";
     lu << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << desc << "));\n";
diff --git a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
index 9fa4ae1bc..5132845c3 100644
--- a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
+++ b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
@@ -91,6 +91,12 @@ void print_tuning_results(std::vector<std::shared_ptr<TuningStatus>> tuned_kerne
            << std::setw(10) << s->status << " | " << std::setw(6) << s->progress_step << "/"
            << FLAGS_fkernel_tuning_steps << " "
            << " | " << std::setw(12) << s->best_perf << " ms |\n";
+
+        if (fabs(s->best_perf + 1.0) < 1e-5)
+        {
+            NNFUSION_LOG(INFO) << "Kernel named \"" << s->op_name << "\" has not yet been tuned.\n"
+                               << s->ir;
+        }
     }
     NNFUSION_LOG(INFO) << ss.str();
 }
diff --git a/src/nnfusion/frontend/onnx_import/core/tensor.hpp b/src/nnfusion/frontend/onnx_import/core/tensor.hpp
index 85a624b14..201aa580d 100644
--- a/src/nnfusion/frontend/onnx_import/core/tensor.hpp
+++ b/src/nnfusion/frontend/onnx_import/core/tensor.hpp
@@ -22,7 +22,7 @@
 #pragma once
 
 #include "../util/util.hpp"
-#include "ngraph/src/nnfusion/common/type/data_buffer.hpp"
+#include "nnfusion/common/type/data_buffer.hpp"
 
 namespace nnfusion
 {
diff --git a/src/nnfusion/frontend/onnx_import/op/constant.hpp b/src/nnfusion/frontend/onnx_import/op/constant.hpp
index 4280f9f86..87163617d 100644
--- a/src/nnfusion/frontend/onnx_import/op/constant.hpp
+++ b/src/nnfusion/frontend/onnx_import/op/constant.hpp
@@ -68,8 +68,7 @@ namespace nnfusion
                     // const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type());
                     // auto op = func_param(tensor.get_ng_type(), tensor);
                     auto op = std::make_shared<op::Constant>(
-                        tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data()
-                    );
+                        tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data());
 
                     op->set_name(node_proto.output(0));
                     auto gnode = m_graph->add_node_and_edge(op, graph::GNodeVector({}));
diff --git a/src/nnfusion/frontend/onnx_import/ops_bridge.cpp b/src/nnfusion/frontend/onnx_import/ops_bridge.cpp
index d63fbf34a..9724b435a 100644
--- a/src/nnfusion/frontend/onnx_import/ops_bridge.cpp
+++ b/src/nnfusion/frontend/onnx_import/ops_bridge.cpp
@@ -134,6 +134,7 @@ namespace nnfusion
                 REGISTER_EMPTY_DOMAIN("ai.onnx.ml");
                 REGISTER_EMPTY_DOMAIN("com.microsoft");
                 REGISTER_EMPTY_DOMAIN("com.microsoft.mlfeaturizers");
+                REGISTER_EMPTY_DOMAIN("ai.onnx.preview.training");
                 REGISTER_OPERATOR("Abs", 1, TranslateUnaryOp<op::Abs>);
                 REGISTER_OPERATOR("Acos", 1, TranslateUnaryOp<op::Acos>);
                 REGISTER_OPERATOR("AdamOptimizer", 1, TranslateAdamOptimizerOp);
diff --git a/src/nnfusion/frontend/onnx_import/util/util.hpp b/src/nnfusion/frontend/onnx_import/util/util.hpp
index 31bf27726..2f7262842 100644
--- a/src/nnfusion/frontend/onnx_import/util/util.hpp
+++ b/src/nnfusion/frontend/onnx_import/util/util.hpp
@@ -28,8 +28,8 @@
 #include <vector>
 
 #include "../onnx_base.hpp"
-#include "ngraph/src/nnfusion/common/type/data_buffer.hpp"
 #include "nnfusion/common/common.hpp"
+#include "nnfusion/common/type/data_buffer.hpp"
 
 namespace nnfusion
 {

From f015bc62893d398ba52e398ddc6a25e436b3426f Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Tue, 5 Jan 2021 17:07:37 +0800
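Subject: [PATCH 19/32] change priority of evaluator runtime

Probe the CUDA default runtime first and fall back to the ROCm default
runtime only when the CUDA environment check fails. On the ROCm path,
CUDA_GPU kernel registrations are still used when no ROCM_GPU registration
is found for the op type.
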
---
 src/nnfusion/frontend/util/evaluator.hpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/nnfusion/frontend/util/evaluator.hpp b/src/nnfusion/frontend/util/evaluator.hpp
index 1e5a56e36..9edbc43c0 100644
--- a/src/nnfusion/frontend/util/evaluator.hpp
+++ b/src/nnfusion/frontend/util/evaluator.hpp
@@ -105,21 +105,21 @@ namespace nnfusion
                 nnfusion::profiler::IProfilingRuntime::Pointer runtime = nullptr;
                 std::vector<shared_ptr<const KernelRegistration>> kernel_regs;
 
-                runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime();
+                runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime();
                 if (runtime->check_env())
                 {
                     kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                        gnode->get_op_type(), ROCM_GPU, element::f32);
-                    if (kernel_regs.size() == 0)
-                        kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                            gnode->get_op_type(), CUDA_GPU, element::f32);
+                        gnode->get_op_type(), CUDA_GPU, element::f32);
                 }
                 else
                 {
-                    runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime();
+                    runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime();
                     NNFUSION_CHECK(runtime->check_env());
                     kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                        gnode->get_op_type(), CUDA_GPU, element::f32);
+                        gnode->get_op_type(), ROCM_GPU, element::f32);
+                    if (kernel_regs.size() == 0)
+                        kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
+                            gnode->get_op_type(), CUDA_GPU, element::f32);
                 }
 
                 bool const_infer_success = false;

From 86ccb2d1ee99723a96f641d1a1e9eed74fbff2d8 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 25 Nov 2020 12:42:56 +0000
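Subject: [PATCH 20/32] changes for fp16

Add an element::f16 branch to cuda::Dot::emit_function_body() that emits a
cublasHgemm call, and add the missing end-of-file newline to several kernel
source files.
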
---
 .../core/kernels/cpu/reference/constant.cpp   |  2 +-
 .../core/kernels/cpu/reference/variable.cpp   |  2 +-
 .../kernels/cuda_gpu/kernels/apply_adam.cpp   |  2 +-
 .../kernels/cuda_gpu/kernels/constant.cpp     |  2 +-
 .../kernels/cuda_gpu/kernels/convolution.cpp  |  1 +
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 80 +++++++++++++++++++
 .../cuda_gpu/kernels/dynamic_stitch.cpp       |  2 +-
 .../core/kernels/cuda_gpu/kernels/pad.cpp     |  2 +-
 .../core/kernels/cuda_gpu/kernels/range.cpp   |  2 +-
 .../core/kernels/cuda_gpu/kernels/result.cpp  |  2 +-
 .../core/kernels/cuda_gpu/kernels/reverse.cpp |  2 +-
 .../cuda_gpu/kernels/reverse_sequence.cpp     |  2 +-
 .../cuda_gpu/kernels/strided_slice_grad.cpp   |  2 +-
 .../core/kernels/cuda_gpu/kernels/tile.cpp    |  2 +-
 .../kernels/cuda_gpu/kernels/variable.cpp     |  2 +-
 .../core/kernels/kernel_registration.cpp      |  1 +
 16 files changed, 95 insertions(+), 13 deletions(-)

diff --git a/src/nnfusion/core/kernels/cpu/reference/constant.cpp b/src/nnfusion/core/kernels/cpu/reference/constant.cpp
index 7917d10ad..b922d2cee 100644
--- a/src/nnfusion/core/kernels/cpu/reference/constant.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/constant.cpp
@@ -71,4 +71,4 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Constant",                                       //op_name
                         Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
-                        cpu::Constant)                                    // constructor
\ No newline at end of file
+                        cpu::Constant)                                    // constructor
diff --git a/src/nnfusion/core/kernels/cpu/reference/variable.cpp b/src/nnfusion/core/kernels/cpu/reference/variable.cpp
index 5e16388f6..a4eeeea2b 100644
--- a/src/nnfusion/core/kernels/cpu/reference/variable.cpp
+++ b/src/nnfusion/core/kernels/cpu/reference/variable.cpp
@@ -69,4 +69,4 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Variable",                                       //op_name
                         Device(GENERIC_CPU).TypeConstraint(element::f32), //attrs
-                        cpu::Variable)                                    // constructor
\ No newline at end of file
+                        cpu::Variable)                                    // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
index e42e0eda9..44308b801 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/apply_adam.cpp
@@ -108,4 +108,4 @@ using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER(
     "ApplyAdam",
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2),
-    cuda::ApplyAdam)
\ No newline at end of file
+    cuda::ApplyAdam)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
index 73f04f5fd..d43bcca07 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/constant.cpp
@@ -121,4 +121,4 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Constant",                                                //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::Constant)                                            // constructor
\ No newline at end of file
+                        cuda::Constant)                                            // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
index 6b91e3956..549d428c6 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
@@ -87,6 +87,7 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_body()
         padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
     }
 
+
     {
         // lu << "cudnnDataType_t data_type = " << get_cudnn_datatype(dtype) << ";\n";
         lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index acd1939b6..23e7761cf 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -203,6 +203,86 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                << " static_cast<float*>(output0),"
                << " " << n << "));\n";
         }
+    } else if (dtype == element::f16) {
+        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
+        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
+        size_t axes_for_k_count = reduction_axes;
+        size_t m = 1;
+        size_t n = 1;
+        size_t k = 1;
+
+        // check if input and output size correct
+        // check and calculate k for arg0 and arg1
+        size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
+        size_t arg1_k_idx = 0;                // first axe in arg1 for k
+
+        for (size_t i = 0; i < axes_for_k_count; i++)
+        {
+            k *= arg0_shape[arg0_k_idx];
+            if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+            {
+                std::vector<std::string> arg_vec{"arg0", "arg1"};
+                std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                        << nnfusion::join(shape_vec) << " respectively, at Node "
+                                        << m_context->gnode->get_name()
+                                        << ", do not match for dot op";
+            }
+        }
+        // check and calculate m for arg0 and out
+        size_t arg0_m_idx = 0; // first axe in arg0 for m
+        size_t out_m_idx = 0;  // first axe in out for m
+        for (size_t i = 0; i < axes_for_m_count; i++)
+        {
+            m *= arg0_shape[arg0_m_idx];
+            if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+            {
+                std::vector<std::string> arg_vec{"arg0", "output"};
+                std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
+
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                        << nnfusion::join(shape_vec) << " respectively, at Node "
+                                        << m_context->gnode->get_name()
+                                        << ", do not match for dot op";
+            }
+        }
+        // check and calculate n for arg1 and out
+        size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
+        size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
+        for (size_t i = 0; i < axes_for_n_count; i++)
+        {
+            n *= arg1_shape[arg1_n_idx];
+            if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+            {
+                std::vector<std::string> arg_vec{"arg1", "output"};
+                std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
+
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                        << nnfusion::join(shape_vec) << " respectively, at Node "
+                                        << m_context->gnode->get_name()
+                                        << ", do not match for dot op";
+            }
+        }
+
+        lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
+
+        lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+            << " CUBLAS_OP_N,"
+            << " CUBLAS_OP_N,"
+            << " " << n << ","
+            << " " << m << ","
+            << " " << k << ","
+            << " &alpha,"
+            << " static_cast<const half*>(input1),"
+            << " " << n << ","
+            << " static_cast<const half*>(input0),"
+            << " " << k << ","
+            << " &beta,"
+            << " static_cast<half*>(output0),"
+            << " " << n << "));\n";
+    } else {
+        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."
     }
     else if (dtype == element::f16)
     {
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
index 4bd847949..00cd81136 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dynamic_stitch.cpp
@@ -124,4 +124,4 @@ LanguageUnit_p cuda::DynamicStitch::emit_dependency()
 REGISTER_KERNEL_EMITTER(
     "DynamicStitch",                                                              // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::DynamicStitch)                                                          // constructor
\ No newline at end of file
+    cuda::DynamicStitch)                                                          // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
index faab94fe9..733c037e4 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/pad.cpp
@@ -149,4 +149,4 @@ KernelRegistrar kernel_registrar0(
 REGISTER_KERNEL_EMITTER(
     "Pad",                                                                        // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Pad)                                                                    // constructor
\ No newline at end of file
+    cuda::Pad)                                                                    // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
index 1c5a30279..f7a06a159 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/range.cpp
@@ -65,4 +65,4 @@ LanguageUnit_p cuda::Range::emit_dependency()
 REGISTER_KERNEL_EMITTER(
     "Range",                                                                      // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Range)                                                                  // constructor
\ No newline at end of file
+    cuda::Range)                                                                  // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
index 229580e6a..73b819fd7 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/result.cpp
@@ -90,4 +90,4 @@ LanguageUnit_p cuda::Result::emit_dependency()
 REGISTER_KERNEL_EMITTER(
     "Result",                                                                  // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_lib").Priority(2), // attrs
-    cuda::Result)                                                              // constructor
\ No newline at end of file
+    cuda::Result)                                                              // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
index 6d5fc374d..e3be51ffc 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse.cpp
@@ -103,4 +103,4 @@ LanguageUnit_p cuda::Reverse::emit_dependency()
 REGISTER_KERNEL_EMITTER(
     "Reverse",                                                                    // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::Reverse)                                                                // constructor
\ No newline at end of file
+    cuda::Reverse)                                                                // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
index 487951930..612c51730 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reverse_sequence.cpp
@@ -130,4 +130,4 @@ REGISTER_KERNEL_EMITTER(
 
 REGISTER_KERNEL_EMITTER("ReverseSequence",                                         // op_name
                         Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), // attrs
-                        cuda::RocmReverseSequence)                                 // constructor
\ No newline at end of file
+                        cuda::RocmReverseSequence)                                 // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
index 342edf949..ab27b0ec7 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/strided_slice_grad.cpp
@@ -115,4 +115,4 @@ LanguageUnit_p cuda::StridedSliceGrad::emit_dependency()
 REGISTER_KERNEL_EMITTER(
     "StridedSliceGrad",                                                           // op_name
     Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cuda_kernel").Priority(2), // attrs
-    cuda::StridedSliceGrad)                                                       // constructor
\ No newline at end of file
+    cuda::StridedSliceGrad)                                                       // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
index 33a869e71..6dc5220d9 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/tile.cpp
@@ -153,4 +153,4 @@ REGISTER_KERNEL_EMITTER(
 
 REGISTER_KERNEL_EMITTER("Tile",                                                    //op_name
                         Device(ROCM_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::RocmTile)                                            // constructor
\ No newline at end of file
+                        cuda::RocmTile)                                            // constructor
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
index 419124649..80c5cc707 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/variable.cpp
@@ -79,4 +79,4 @@ using namespace nnfusion;
 using namespace nnfusion::kernels;
 REGISTER_KERNEL_EMITTER("Variable",                                                //op_name
                         Device(CUDA_GPU).TypeConstraint(element::f32).Priority(2), //attrs
-                        cuda::Variable)                                            // constructor
\ No newline at end of file
+                        cuda::Variable)                                            // constructor
diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp
index 67ded5b8f..9aba33df2 100644
--- a/src/nnfusion/core/kernels/kernel_registration.cpp
+++ b/src/nnfusion/core/kernels/kernel_registration.cpp
@@ -4,6 +4,7 @@
 #include "kernel_registration.hpp"
 #include "nnfusion/common/type/element_type.hpp"
 #include "nnfusion/util/util.hpp"
+#include "ngraph/src/nnfusion/common/type/element_type.hpp"
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;

From e9fa2861d15546c376c9d3a0c61af81849d4f448 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 27 Nov 2020 09:00:51 +0000
Subject: [PATCH 21/32] vgg11 runnable

---
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 210 ++++---
 .../pass/graph/codegen_dxcompute_pass.hpp     | 533 ++++++++++++++++++
 2 files changed, 678 insertions(+), 65 deletions(-)
 create mode 100644 src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp

diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 23e7761cf..7838e4300 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -204,83 +204,163 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                << " " << n << "));\n";
         }
     } else if (dtype == element::f16) {
-        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
-        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
-        size_t axes_for_k_count = reduction_axes;
-        size_t m = 1;
-        size_t n = 1;
-        size_t k = 1;
+        // case 1: Scalar * Tensor
+        // if (arg0_shape.empty() || arg1_shape.empty())
+        // {
+        //     auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape);
+        //     size_t count = nnfusion::shape_size(second);
 
-        // check if input and output size correct
-        // check and calculate k for arg0 and arg1
-        size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
-        size_t arg1_k_idx = 0;                // first axe in arg1 for k
+        //     string firstarg = (arg0_shape.empty() ? "input1" : "input0");
+        //     string secondarg = (arg0_shape.empty() ? "input0" : "input1");
 
-        for (size_t i = 0; i < axes_for_k_count; i++)
-        {
-            k *= arg0_shape[arg0_k_idx];
-            if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+        //     lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
+
+        //     lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n";     // copy `firstarg` to `output0`
+        //     lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n";
+        // }
+        // // case 2: 1d Dot
+        // else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes))
+        // {
+        //     for (int i = 0; i < arg0_shape.size(); i++)
+        //     {
+        //         if (arg0_shape[i] != arg1_shape[i])
+        //         {
+        //             std::vector<std::string> arg_vec{"arg0", "arg1"};
+        //             std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+
+        //             NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+        //                                 << nnfusion::join(shape_vec) << " respectively, at Node "
+        //                                 << m_context->gnode->get_name()
+        //                                 << ", do not match for dot op";
+        //         }
+        //     }
+
+        //     size_t count = nnfusion::shape_size(arg0_shape);
+        //     lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
+
+        //     lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count
+        //     << ", static_cast<const float*>(input0), 1, static_cast<const float*>(input1), 1, "
+        //         "static_cast<float*>(output0)));\n";
+        // }
+        // // matrix * vector
+        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
+        // {
+        //     lu << "const float alpha = 1.0;\n const float beta = 0;\n";
+        //     lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
+        //     if (trans_A)
+        //         lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
+        //     else
+        //         lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", ";
+        //     lu << " &alpha,"
+        //     << " static_cast<const float*>(input0)," << arg0_shape[1] << ", "
+        //     << " static_cast<const float*>(input1),"
+        //     << " 1,"
+        //     << " &beta,"
+        //     << " static_cast<float*>(output0),"
+        //     << " 1));\n";
+        // }
+        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
+        //         (trans_A || trans_B))
+        // {
+        //     int m = trans_B ? arg1_shape[0] : arg1_shape[1];
+        //     int n = trans_A ? arg0_shape[1] : arg0_shape[0];
+        //     int k = trans_A ? arg0_shape[0] : arg0_shape[1];
+
+        //     lu << "const half alpha = 1.0;\nconst half beta = 0;\n";
+
+        //     lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+        //     << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
+        //     << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
+        //     << " " << n << ","
+        //     << " " << k << ","
+        //     << " &alpha,"
+        //     << " static_cast<const half*>(input1),"
+        //     << " " << arg1_shape[1] << ","
+        //     << " static_cast<const half*>(input0),"
+        //     << " " << arg0_shape[1] << ","
+        //     << " &beta,"
+        //     << " static_cast<half*>(output0),"
+        //     << " " << m << "));\n";
+        // } else {
+            size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
+            size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
+            size_t axes_for_k_count = reduction_axes;
+            size_t m = 1;
+            size_t n = 1;
+            size_t k = 1;
+
+            // check if input and output size correct
+            // check and calculate k for arg0 and arg1
+            size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
+            size_t arg1_k_idx = 0;                // first axe in arg1 for k
+
+            for (size_t i = 0; i < axes_for_k_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg0", "arg1"};
-                std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+                k *= arg0_shape[arg0_k_idx];
+                if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg0", "arg1"};
+                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                        << nnfusion::join(shape_vec) << " respectively, at Node "
-                                        << m_context->gnode->get_name()
-                                        << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                            << nnfusion::join(shape_vec) << " respectively, at Node "
+                                            << m_context->gnode->get_name()
+                                            << ", do not match for dot op";
+                }
             }
-        }
-        // check and calculate m for arg0 and out
-        size_t arg0_m_idx = 0; // first axe in arg0 for m
-        size_t out_m_idx = 0;  // first axe in out for m
-        for (size_t i = 0; i < axes_for_m_count; i++)
-        {
-            m *= arg0_shape[arg0_m_idx];
-            if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+            // check and calculate m for arg0 and out
+            size_t arg0_m_idx = 0; // first axe in arg0 for m
+            size_t out_m_idx = 0;  // first axe in out for m
+            for (size_t i = 0; i < axes_for_m_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg0", "output"};
-                std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
+                m *= arg0_shape[arg0_m_idx];
+                if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg0", "output"};
+                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                        << nnfusion::join(shape_vec) << " respectively, at Node "
-                                        << m_context->gnode->get_name()
-                                        << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                            << nnfusion::join(shape_vec) << " respectively, at Node "
+                                            << m_context->gnode->get_name()
+                                            << ", do not match for dot op";
+                }
             }
-        }
-        // check and calculate n for arg1 and out
-        size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
-        size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
-        for (size_t i = 0; i < axes_for_n_count; i++)
-        {
-            n *= arg1_shape[arg1_n_idx];
-            if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+            // check and calculate n for arg1 and out
+            size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
+            size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
+            for (size_t i = 0; i < axes_for_n_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg1", "output"};
-                std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
+                n *= arg1_shape[arg1_n_idx];
+                if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg1", "output"};
+                    std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                        << nnfusion::join(shape_vec) << " respectively, at Node "
-                                        << m_context->gnode->get_name()
-                                        << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                            << nnfusion::join(shape_vec) << " respectively, at Node "
+                                            << m_context->gnode->get_name()
+                                            << ", do not match for dot op";
+                }
             }
-        }
-
-        lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
 
-        lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-            << " CUBLAS_OP_N,"
-            << " CUBLAS_OP_N,"
-            << " " << n << ","
-            << " " << m << ","
-            << " " << k << ","
-            << " &alpha,"
-            << " static_cast<const half*>(input1),"
-            << " " << n << ","
-            << " static_cast<const half*>(input0),"
-            << " " << k << ","
-            << " &beta,"
-            << " static_cast<half*>(output0),"
-            << " " << n << "));\n";
+            lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
+
+            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+                << " CUBLAS_OP_N,"
+                << " CUBLAS_OP_N,"
+                << " " << n << ","
+                << " " << m << ","
+                << " " << k << ","
+                << " &alpha,"
+                << " static_cast<const half*>(input1),"
+                << " " << n << ","
+                << " static_cast<const half*>(input0),"
+                << " " << k << ","
+                << " &beta,"
+                << " static_cast<half*>(output0),"
+                << " " << n << "));\n";
+        // }
+        
     } else {
         NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."
     }
diff --git a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp
new file mode 100644
index 000000000..1779ad827
--- /dev/null
+++ b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp
@@ -0,0 +1,533 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "graph_pass_base.hpp"
+#include "nnfusion/core/operators/generic_op/generic_op.hpp"
+#include "nnfusion/core/operators/op_define/constant.hpp"
+#include "nnfusion/engine/profiler/profiler.hpp"
+#include "nnfusion/util/curl_request.hpp"
+
+using namespace nnfusion::graph;
+
+DECLARE_string(fdefault_device);
+DECLARE_string(fantares_codegen_server);
+
+namespace nnfusion
+{
+    namespace pass
+    {
+        namespace graph
+        {
+            class DirectComputeCodegenPass : public GraphPassBase
+            {
+                std::string currentBackend;
+                // Returns the kernel code generated for `expr` by the server named in
+                // FLAGS_fantares_codegen_server; responses are cached per expression.
+                std::string autogen(const std::string& expr)
+                {
+                    if (FLAGS_fantares_codegen_server == "")
+                        FLAGS_fantares_codegen_server = "10.150.145.98:8884";
+                    static std::unordered_map<std::string, std::string> code_cache;
+                    auto it = code_cache.find(expr);
+                    if (it != code_cache.end())
+                        return it->second;
+
+                    std::string response;
+                    CurlRequest req(FLAGS_fantares_codegen_server);
+                    req.add_custom_header(("COMPUTE_V1: " + expr).c_str());
+                    req.add_custom_header("ARGS: ");
+
+                    printf("[Autogen] %s\n", expr.c_str());
+                    NNFUSION_CHECK(true == req.send_request(response));
+                    NNFUSION_CHECK(strncmp(response.c_str(), "[ERROR]", 7) != 0) << expr << "\n"
+                                                                                 << response;
+                    code_cache[expr] = response;
+                    return response; // plain return keeps copy elision; std::move() defeats it
+                }
+
+                // Joins func(index, element) over `vect` with ", ". With skip_empty set, empty
+                // pieces are dropped and do not advance the index handed to `func`.
+                template <class T1, class T2>
+                inline std::string
+                    join_collections(const T1& vect, T2 func, bool skip_empty = false)
+                {
+                    std::stringstream joined;
+                    int emitted = 0; // pieces written so far; also the index passed to func
+                    for (auto& element : vect)
+                    {
+                        auto piece = func(emitted, element);
+                        if (skip_empty && piece.size() == 0)
+                            continue;
+                        joined << (emitted > 0 ? ", " : "") << piece;
+                        ++emitted;
+                    }
+                    return joined.str();
+                }
+
+                // inline int get_type_id(nnfusion::element::Type type)
+                // {
+                //     // TODO: fill more type cases
+                //     if (type == nnfusion::element::f32)
+                //         return DT_FLOAT;
+                //     throw std::runtime_error("Not supported element type.");
+                // }
+
+                template <class T> // T: op class expected for `curr`; a mismatch fails the check
+                inline std::shared_ptr<T> get_op_object(std::shared_ptr<GNode>& curr)
+                {
+                    auto typed_op = std::dynamic_pointer_cast<T>(curr->get_op_ptr());
+                    NNFUSION_CHECK_NOT_NULLPTR(typed_op) << "Node type is not "
+                                                         << curr->get_op_ptr()->get_op_type();
+                    return typed_op;
+                }
+
+                inline void UNHANDLED_CASE(std::shared_ptr<GNode>& curr)
+                { // Prints the op type and every input/output shape, then ends the process.
+                    printf("## Unhandled case for %s:\n",
+                           curr->get_op_ptr()->get_op_type().c_str());
+                    for (int i = 0; i < curr->get_input_size(); ++i)
+                        printf(">> in-%d : %s\n",
+                               i,
+                               vector_to_string(curr->get_input_shape(i)).c_str());
+                    for (int i = 0; i < curr->get_output_size(); ++i)
+                        printf(">> out-%d: %s\n",
+                               i,
+                               vector_to_string(curr->get_output_shape(i)).c_str());
+                    exit(1); // terminates with a failure status
+                };
+
+            public:
+                bool run_on_graph(std::shared_ptr<Graph>& graph) override
+                {
+                    currentBackend = "dxcompute";
+
+                    NNFUSION_LOG(INFO) << "Codegen for " << currentBackend << " starts up.";
+
+                    auto nodes = graph->get_nodes();
+                    std::unordered_map<std::shared_ptr<GNode>, int> din, dout;
+
+                    // Count degrees
+                    for (auto& it : nodes)
+                    {
+                        for (auto& in_edge : it->get_in_edges())
+                        {
+                            if (in_edge->is_control_edge())
+                                continue;
+                            NNFUSION_CHECK(in_edge->get_dst() == it);
+                            din[it]++;
+                            dout[in_edge->get_src()]++;
+                        }
+                    }
+
+                    // Name nodes, legality checks
+                    std::unordered_set<std::shared_ptr<GNode>> visited, vis_pend, blacklist;
+                    std::unordered_set<std::string> name_used;
+                    std::unordered_map<std::shared_ptr<GNode>, std::string> arg_names;
+                    for (auto& it : nodes)
+                    {
+                        NNFUSION_CHECK(it.get() != nullptr);
+
+                        auto arg_name = "Z0_" + it->get_op_ptr()->get_op_type() + "_" +
+                                        it->get_op_ptr()->get_name();
+                        for (auto& c : arg_name)
+                            if (!isalpha(c) && !isdigit(c))
+                                c = '_';
+                        if (name_used.count(arg_name))
+                        {
+                            for (int i = 1;; ++i)
+                            {
+                                auto alter = arg_name + "_" + std::to_string(i);
+                                if (!name_used.count(alter))
+                                {
+                                    arg_name = alter;
+                                    break;
+                                }
+                            }
+                        }
+                        name_used.insert(arg_name);
+                        arg_names[it] = arg_name;
+
+                        if (din[it] == 0 && dout[it] == 0)
+                            visited.insert(it), blacklist.insert(it);
+                        NNFUSION_CHECK(it->get_output_size() == 1);
+                    }
+                    NNFUSION_LOG(INFO) << "There are " << blacklist.size()
+                                       << " standalone GNode(s) found.";
+                    name_used.clear();
+
+                    // Fill offsetup nodes
+                    std::deque<std::shared_ptr<GNode>> gen_q, pend_q;
+                    for (auto& it : nodes)
+                    {
+                        if (visited.count(it))
+                            continue;
+                        if (din[it] == 0)
+                        {
+                            gen_q.push_back(it);
+                        }
+                    }
+
+                    NNFUSION_CHECK(
+                        0 ==
+                        system(("mkdir -p nnfusion_rt/" + currentBackend + "_codegen").c_str()));
+
+                    std::ofstream fout("nnfusion_rt/" + currentBackend + "_codegen/nnfusion_rt.h");
+
+                    fout << "#if 1\n\n";
+                    // Perform blockfusion
+                    int offset = 0, step = 0;
+                    auto new_super_step = [&]() {
+                        while (pend_q.size())
+                        {
+                            gen_q.push_back(pend_q.front());
+                            pend_q.pop_front();
+                        }
+                        if (offset > 0)
+                            ++step, offset = 0;
+                    };
+
+                    // Emits runtime code for `curr` into `fout`: fetches the kernel for `ir` via
+                    // autogen(), stores each distinct kernel once as .hlsl, then declares the op.
+                    auto print_standard_codegen = [&](std::shared_ptr<GNode>& curr,
+                                                      std::ofstream& fout,
+                                                      std::string ir,
+                                                      std::string options) {
+                        std::string code = autogen(ir);
+                        // Normalize to "|a|b|" so each flag can be matched as "|flag|".
+                        if (options.size() > 0)
+                        {
+                            if (options[0] != '|')
+                                options = "|" + options;
+                            if (options.back() != '|')
+                                options += "|";
+                        }
+                        const auto has_option = [&options](const char* flag) {
+                            return options.find(flag) != std::string::npos;
+                        };
+
+                        // memcpy: no operator is emitted; the output is a reference to the input.
+                        if (has_option("|memcpy|"))
+                        {
+                            NNFUSION_CHECK(curr->get_input_size() == 1);
+                            fout << "NNfusionTensor &" << arg_names[curr] << " = "
+                                 << arg_names[curr->get_in_edge(0)->get_src()] << ";\n";
+                            return;
+                        }
+
+                        // Kernel sources are keyed by their text; identical code reuses one file.
+                        static std::unordered_map<std::string, std::string> dedupe_kernels;
+                        auto kernel = dedupe_kernels.find(code);
+                        if (kernel == dedupe_kernels.end())
+                        {
+                            const auto dir = "nnfusion_rt/" + currentBackend + "_codegen/HLSL";
+                            const auto path = dir + "/" + arg_names[curr] + ".hlsl";
+                            NNFUSION_CHECK(0 == system(("mkdir -p " + dir).c_str()));
+                            FILE* fp = fopen(path.c_str(), "wb");
+                            NNFUSION_CHECK(fp != nullptr);
+                            NNFUSION_CHECK(code.size() == fwrite(code.c_str(), 1, code.size(), fp));
+                            fclose(fp);
+                            kernel = dedupe_kernels.emplace(code, arg_names[curr]).first;
+                        }
+
+                        // Writes the comma-separated names of the tensors feeding `curr`.
+                        auto emit_input_names = [&]() {
+                            for (int i = 0; i < curr->get_input_size(); ++i)
+                                fout << (i ? ", " : "")
+                                     << arg_names[curr->get_in_edge(i)->get_src()];
+                        };
+
+                        fout << "// " << ir << "\n";
+                        if (!has_option("|inplace_wg|"))
+                        {
+                            fout << "NNfusionTensor " << arg_names[curr] << "(device, {"
+                                 << join_collections(
+                                        curr->get_output_shape(0),
+                                        [](int idx, ssize_t it) { return std::to_string(it); })
+                                 << "}, sizeof(" << curr->get_output_element_type(0).c_type_string()
+                                 << "));\n";
+                            fout << "  NNfusionOperator op_" << arg_names[curr] << "(device, {";
+                            emit_input_names();
+                            fout << "}, { " << arg_names[curr] << " }, L\"" << kernel->second
+                                 << ".hlsl\");";
+                        }
+                        else
+                        {
+                            // inplace_wg: input 0 is listed as the output and `curr` aliases it.
+                            fout << "  NNfusionOperator op_" << arg_names[curr] << "(device, {";
+                            emit_input_names();
+                            fout << "}, { " << arg_names[curr->get_in_edge(0)->get_src()]
+                                 << " }, L\"" << kernel->second << ".hlsl\");\n";
+                            fout << "auto& " << arg_names[curr] << " = "
+                                 << arg_names[curr->get_in_edge(0)->get_src()] << ";";
+                        }
+                    };
+
+                    // Emits an elementwise kernel: each input and the output use one flattened shape
+                    // [N], N = element count of input 0; `topi` becomes output()'s second argument.
+                    auto codegen_for_elementwise = [&](std::shared_ptr<GNode>& curr,
+                                                       std::ofstream& fout,
+                                                       const std::string& topi,
+                                                       const std::string& options = "") {
+                        std::string expr = " -";
+                        for (int i = 0; i < curr->get_input_size(); ++i)
+                            expr += " input(\"input" + std::to_string(i) + "\", @common_shape@);";
+                        expr += " output(@common_shape@, " + topi + ");";
+
+                        size_t num_elements = 1; // size_t: an int product overflows on big tensors
+                        for (const auto& dim : curr->get_input_shape(0))
+                            num_elements *= dim;
+                        const std::string common_shape = "[ " + std::to_string(num_elements) + " ]";
+                        print_standard_codegen(
+                            curr,
+                            fout,
+                            op::create_code_from_template(expr, {{"common_shape", common_shape}}),
+                            options);
+                    };
+
+                    std::unordered_map<std::string,
+                                       std::function<void(std::shared_ptr<GNode>&, std::ofstream&)>>
+                        kernel_dict;
+
+                    // Elementwise Ops
+                    kernel_dict["Subtract"] = [&](std::shared_ptr<GNode>& curr,
+                                                  std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.subtract(args(\"input0\"), args(\"input1\"))");
+                    };
+                    kernel_dict["Multiply"] = [&](std::shared_ptr<GNode>& curr,
+                                                  std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.multiply(args(\"input0\"), args(\"input1\"))");
+                    };
+                    kernel_dict["Divide"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.divide(args(\"input0\"), args(\"input1\"))");
+                    };
+                    kernel_dict["DivNoNan"] = [&](std::shared_ptr<GNode>& curr,
+                                                  std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr,
+                            fout,
+                            "lambda x: tvm.te.if_then_else(args(\"input1\")[x] != "
+                            "0, args(\"input0\")[x] / args(\"input1\")[x], 0)");
+                    };
+                    kernel_dict["Power"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.power(args(\"input0\"), args(\"input1\"))");
+                    };
+                    kernel_dict["LessEq"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.less_equal(args(\"input0\"), args(\"input1\"))");
+                    };
+                    kernel_dict["Equal"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.equal(args(\"input0\"), args(\"input1\"))");
+                    };
+                    kernel_dict["Maximum"] = [&](std::shared_ptr<GNode>& curr,
+                                                 std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.maximum(args(\"input0\"), args(\"input1\"))");
+                    };
+                    kernel_dict["Exp"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(curr, fout, "topi=topi.exp(args(\"input0\"))");
+                    };
+                    kernel_dict["Negative"] = [&](std::shared_ptr<GNode>& curr,
+                                                  std::ofstream& fout) {
+                        codegen_for_elementwise(curr, fout, "topi=topi.negative(args(\"input0\"))");
+                    };
+                    kernel_dict["Tanh"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(curr, fout, "topi=topi.tanh(args(\"input0\"))");
+                    };
+                    kernel_dict["Relu6"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.clip(args(\"input0\"), 0, 6)");
+                    };
+                    kernel_dict["Sigmoid"] = [&](std::shared_ptr<GNode>& curr,
+                                                 std::ofstream& fout) {
+                        codegen_for_elementwise(curr, fout, "topi=topi.sigmoid(args(\"input0\"))");
+                    };
+                    kernel_dict["Square"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr, fout, "topi=topi.multiply(args(\"input0\"), args(\"input0\"))");
+                    };
+                    kernel_dict["Rsqrt"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(curr, fout, "topi=topi.rsqrt(args(\"input0\"))");
+                    };
+                    kernel_dict["Log"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(curr, fout, "topi=topi.log(args(\"input0\"))");
+                    };
+                    kernel_dict["ReluBackprop"] = [&](std::shared_ptr<GNode>& curr,
+                                                      std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr,
+                            fout,
+                            "lambda x: tvm.te.if_then_else(args(\"input0\")[x] > "
+                            "0, args(\"input1\")[x], 0)");
+                    };
+                    kernel_dict["Select"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        codegen_for_elementwise(
+                            curr,
+                            fout,
+                            "lambda x: tvm.te.if_then_else(args(\"input0\")[x] == "
+                            "0, args(\"input2\")[x], args(\"input1\")[x])");
+                    };
+
+                    // Non-standard Ops
+                    kernel_dict["Constant"] = [&](std::shared_ptr<GNode>& curr,
+                                                  std::ofstream& fout) {
+                        auto p_const = std::dynamic_pointer_cast<op::Constant>(curr->get_op_ptr());
+                        NNFUSION_CHECK(p_const != nullptr);
+                        const void* dptr = p_const->get_data_ptr();
+                        size_t size = p_const->get_data_size();
+
+                        NNFUSION_CHECK(0 == system(("mkdir -p nnfusion_rt/" + currentBackend +
+                                                    "_codegen/Constant")
+                                                       .c_str()));
+                        FILE* fp = fopen(("nnfusion_rt/" + currentBackend + "_codegen/Constant/" +
+                                          arg_names[curr])
+                                             .c_str(),
+                                         "wb");
+                        NNFUSION_CHECK(fp != nullptr);
+                        NNFUSION_CHECK(size == fwrite(dptr, 1, size, fp));
+                        fclose(fp);
+
+                        fout << "NNfusionTensor " << arg_names[curr] << "(device, {"
+                             << join_collections(
+                                    curr->get_output_shape(0),
+                                    [](int idx, ssize_t it) { return std::to_string(it); })
+                             << "}, sizeof(" << curr->get_output_element_type(0).c_type_string()
+                             << "));\n";
+
+                        fout << "  NNfusionMemcpy op_" << arg_names[curr] << "(device, "
+                             << arg_names[curr] << ", load_data<"
+                             << curr->get_output_element_type(0).c_type_string() << ">(\""
+                             << arg_names[curr] << "\", " << arg_names[curr]
+                             << ".NumElements()), true);\n";
+                    };
+
+                    kernel_dict["Parameter"] = [&](std::shared_ptr<GNode>& curr,
+                                                   std::ofstream& fout) {
+                        fout << "NNfusionTensor " << arg_names[curr] << "(device, {"
+                             << join_collections(
+                                    curr->get_output_shape(0),
+                                    [](int idx, ssize_t it) { return std::to_string(it); })
+                             << "}, sizeof(" << curr->get_output_element_type(0).c_type_string()
+                             << "));\n";
+
+                        fout << "  NNfusionMemcpy op_" << arg_names[curr] << "(device, "
+                             << arg_names[curr] << ", load_data<"
+                             << curr->get_output_element_type(0).c_type_string() << ">(\"\", "
+                             << arg_names[curr] << ".NumElements()));\n";
+                    };
+
+                    kernel_dict["Result"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
+                        fout << "NNfusionMemcpy " << arg_names[curr] << "(device, nullptr, "
+                             << arg_names[curr->get_in_edge(0)->get_src()] << ");\n";
+                    };
+
+                    while (gen_q.size() > 0 || pend_q.size() > 0)
+                    {
+                        // Move to new super step if satisfied
+                        if (!gen_q.size())
+                            new_super_step();
+
+                        auto curr = gen_q.front();
+                        gen_q.pop_front();
+                        visited.insert(curr);
+
+                        auto entry = kernel_dict.find(curr->get_op_ptr()->get_op_type());
+                        if (entry != kernel_dict.end())
+                            entry->second(curr, fout);
+                        else
+                        {
+                            auto ir = nnfusion::op::get_translation_v2(curr);
+                            if (ir.empty())
+                                ir = nnfusion::op::get_translation(curr);
+                            if (ir != "")
+                            {
+                                const char annotation[] = "## @annotation: ";
+                                int pos = ir.find(annotation);
+                                std::string options;
+                                if (pos >= 0)
+                                {
+                                    pos += sizeof(annotation) - 1;
+                                    options = ir.substr(pos);
+                                }
+                                print_standard_codegen(curr, fout, ir, options);
+                            }
+                            else
+                                UNHANDLED_CASE(curr);
+                        }
+                        fout << std::endl;
+
+                        // Check its children about whether all inputs are ready (Must be put after any possible new_super_step())
+                        for (auto& edge : curr->get_out_edges())
+                        {
+                            if (edge->is_control_edge())
+                                continue;
+                            NNFUSION_CHECK(edge->get_src() == curr);
+                            NNFUSION_CHECK(visited.count(edge->get_dst()) == 0);
+
+                            bool ready = true;
+                            for (auto& from : edge->get_dst()->get_in_edges())
+                            {
+                                if (from->is_control_edge())
+                                    continue;
+                                if (visited.count(from->get_src()) == 0)
+                                {
+                                    ready = false;
+                                    break;
+                                }
+                            }
+                            if (ready)
+                            {
+                                // Only join pend_q once
+                                if (vis_pend.count(edge->get_dst()) == 0)
+                                {
+                                    vis_pend.insert(edge->get_dst());
+                                    pend_q.push_back(edge->get_dst());
+                                }
+                            }
+                        }
+                    }
+
+                    fout << "#endif\n\n";
+                    fout << R"(
+  device.pCommandQueue->ExecuteCommandLists(preloadQueue.size(), preloadQueue.data());
+  device.pCommandQueue->ExecuteCommandLists(cmdQueue.size(), cmdQueue.data());
+  device.AwaitExecution();
+)";
+                    // Print Results
+                    for (auto& curr : graph->get_outputs()) // Print output nodes
+                    {
+                        if (blacklist.count(curr))
+                            continue;
+                        fout << arg_names[curr] << ".PrintStageBuffer<"
+                             << curr->get_output_element_type(0).c_type_string() << ">(device, \""
+                             << arg_names[curr] << "\");\n";
+                    }
+
+                    fout << std::endl;
+
+                    nnfusion::codegen::copy_file_from_templates(
+                        currentBackend + "/DxCompute.vcxproj",
+                        "nnfusion_rt/" + currentBackend + "_codegen/DxCompute.vcxproj");
+                    nnfusion::codegen::copy_file_from_templates(currentBackend + "/run_graph.cpp",
+                                                                "nnfusion_rt/" + currentBackend +
+                                                                    "_codegen/run_graph.cpp");
+                    nnfusion::codegen::copy_file_from_templates(currentBackend + "/d3dx12_helper.h",
+                                                                "nnfusion_rt/" + currentBackend +
+                                                                    "_codegen/d3dx12_helper.h");
+                    nnfusion::codegen::copy_file_from_templates(
+                        currentBackend + "/d3dx12_nnfusion.h",
+                        "nnfusion_rt/" + currentBackend + "_codegen/d3dx12_nnfusion.h");
+                    NNFUSION_LOG(INFO) << currentBackend << " codegen finished.";
+                    exit(0);
+                    return true;
+                }
+            };
+        } // namespace graph
+    }     // namespace pass
+} // namespace nnfusion

From 64af7da37f8523f2046bc45a5647692f2b44ec45 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 27 Nov 2020 09:28:11 +0000
Subject: [PATCH 22/32] code style applied

---
 .../kernels/cuda_gpu/kernels/convolution.cpp  |   1 -
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 134 +++++++++---------
 .../core/kernels/kernel_registration.cpp      |   1 +
 3 files changed, 69 insertions(+), 67 deletions(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
index 549d428c6..6b91e3956 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/convolution.cpp
@@ -87,7 +87,6 @@ LanguageUnit_p cuda::ConvolutionCudnn::emit_function_body()
         padding_below[i] = static_cast<size_t>(padding_below_diff[i]);
     }
 
-
     {
         // lu << "cudnnDataType_t data_type = " << get_cudnn_datatype(dtype) << ";\n";
         lu << cudnn_tensor_descriptor_from_shape(input_shape, "tensor_desc_0", input_type)
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 7838e4300..2b3a1543d 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -203,7 +203,9 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                << " static_cast<float*>(output0),"
                << " " << n << "));\n";
         }
-    } else if (dtype == element::f16) {
+    }
+    else if (dtype == element::f16)
+    {
         // case 1: Scalar * Tensor
         // if (arg0_shape.empty() || arg1_shape.empty())
         // {
@@ -282,83 +284,83 @@ LanguageUnit_p cuda::Dot::emit_function_body()
         //     << " static_cast<half*>(output0),"
         //     << " " << m << "));\n";
         // } else {
-            size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
-            size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
-            size_t axes_for_k_count = reduction_axes;
-            size_t m = 1;
-            size_t n = 1;
-            size_t k = 1;
+        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
+        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
+        size_t axes_for_k_count = reduction_axes;
+        size_t m = 1;
+        size_t n = 1;
+        size_t k = 1;
 
-            // check if input and output size correct
-            // check and calculate k for arg0 and arg1
-            size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
-            size_t arg1_k_idx = 0;                // first axe in arg1 for k
+        // check if input and output size correct
+        // check and calculate k for arg0 and arg1
+        size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
+        size_t arg1_k_idx = 0;                // first axe in arg1 for k
 
-            for (size_t i = 0; i < axes_for_k_count; i++)
+        for (size_t i = 0; i < axes_for_k_count; i++)
+        {
+            k *= arg0_shape[arg0_k_idx];
+            if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
             {
-                k *= arg0_shape[arg0_k_idx];
-                if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg0", "arg1"};
-                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+                std::vector<std::string> arg_vec{"arg0", "arg1"};
+                std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
 
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                            << nnfusion::join(shape_vec) << " respectively, at Node "
-                                            << m_context->gnode->get_name()
-                                            << ", do not match for dot op";
-                }
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                      << nnfusion::join(shape_vec) << " respectively, at Node "
+                                      << m_context->gnode->get_name()
+                                      << ", do not match for dot op";
             }
-            // check and calculate m for arg0 and out
-            size_t arg0_m_idx = 0; // first axe in arg0 for m
-            size_t out_m_idx = 0;  // first axe in out for m
-            for (size_t i = 0; i < axes_for_m_count; i++)
+        }
+        // check and calculate m for arg0 and out
+        size_t arg0_m_idx = 0; // first axe in arg0 for m
+        size_t out_m_idx = 0;  // first axe in out for m
+        for (size_t i = 0; i < axes_for_m_count; i++)
+        {
+            m *= arg0_shape[arg0_m_idx];
+            if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
             {
-                m *= arg0_shape[arg0_m_idx];
-                if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg0", "output"};
-                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
+                std::vector<std::string> arg_vec{"arg0", "output"};
+                std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
 
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                            << nnfusion::join(shape_vec) << " respectively, at Node "
-                                            << m_context->gnode->get_name()
-                                            << ", do not match for dot op";
-                }
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                      << nnfusion::join(shape_vec) << " respectively, at Node "
+                                      << m_context->gnode->get_name()
+                                      << ", do not match for dot op";
             }
-            // check and calculate n for arg1 and out
-            size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
-            size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
-            for (size_t i = 0; i < axes_for_n_count; i++)
+        }
+        // check and calculate n for arg1 and out
+        size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
+        size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
+        for (size_t i = 0; i < axes_for_n_count; i++)
+        {
+            n *= arg1_shape[arg1_n_idx];
+            if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
             {
-                n *= arg1_shape[arg1_n_idx];
-                if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg1", "output"};
-                    std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
+                std::vector<std::string> arg_vec{"arg1", "output"};
+                std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
 
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                            << nnfusion::join(shape_vec) << " respectively, at Node "
-                                            << m_context->gnode->get_name()
-                                            << ", do not match for dot op";
-                }
+                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                      << nnfusion::join(shape_vec) << " respectively, at Node "
+                                      << m_context->gnode->get_name()
+                                      << ", do not match for dot op";
             }
+        }
 
-            lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
-
-            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-                << " CUBLAS_OP_N,"
-                << " CUBLAS_OP_N,"
-                << " " << n << ","
-                << " " << m << ","
-                << " " << k << ","
-                << " &alpha,"
-                << " static_cast<const half*>(input1),"
-                << " " << n << ","
-                << " static_cast<const half*>(input0),"
-                << " " << k << ","
-                << " &beta,"
-                << " static_cast<half*>(output0),"
-                << " " << n << "));\n";
+        lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
+
+        lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+           << " CUBLAS_OP_N,"
+           << " CUBLAS_OP_N,"
+           << " " << n << ","
+           << " " << m << ","
+           << " " << k << ","
+           << " &alpha,"
+           << " static_cast<const half*>(input1),"
+           << " " << n << ","
+           << " static_cast<const half*>(input0),"
+           << " " << k << ","
+           << " &beta,"
+           << " static_cast<half*>(output0),"
+           << " " << n << "));\n";
         // }
         
     } else {
diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp
index 9aba33df2..786c41d87 100644
--- a/src/nnfusion/core/kernels/kernel_registration.cpp
+++ b/src/nnfusion/core/kernels/kernel_registration.cpp
@@ -5,6 +5,7 @@
 #include "nnfusion/common/type/element_type.hpp"
 #include "nnfusion/util/util.hpp"
 #include "ngraph/src/nnfusion/common/type/element_type.hpp"
+#include "nnfusion/util/util.hpp"
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;

From 3dfda190d56876df44902f757757804963572ec5 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 11 Dec 2020 14:02:15 +0800
Subject: [PATCH 23/32] fix DataBuffer

---
 src/nnfusion/frontend/onnx_import/util/graph_convert.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp b/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp
index 2c29954c2..dfa79661f 100644
--- a/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp
+++ b/src/nnfusion/frontend/onnx_import/util/graph_convert.cpp
@@ -143,7 +143,7 @@ namespace nnfusion
                     onnx::ModelProto proto_without_init;
                     proto_without_init.CopyFrom(model_proto);
                     proto_without_init.mutable_graph()->mutable_initializer()->Clear();
-                    NNFUSION_LOG(INFO) << proto_without_init.DebugString();
+                    // NNFUSION_LOG(INFO) << proto_without_init.DebugString();
                 }
 
                 std::string

From 0bd8eba41722ea1e01c939f0f8bd2116f994fee6 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Fri, 11 Dec 2020 17:10:16 +0800
Subject: [PATCH 24/32] onnx changed to DataBuffer style import

---
 .../frontend/onnx_import/core/tensor.hpp      |  50 ++----
 .../frontend/onnx_import/util/util.cpp        | 144 ++++++++++++++----
 .../frontend/onnx_import/util/util.hpp        |   5 +-
 3 files changed, 133 insertions(+), 66 deletions(-)

diff --git a/src/nnfusion/frontend/onnx_import/core/tensor.hpp b/src/nnfusion/frontend/onnx_import/core/tensor.hpp
index 415abc080..85a624b14 100644
--- a/src/nnfusion/frontend/onnx_import/core/tensor.hpp
+++ b/src/nnfusion/frontend/onnx_import/core/tensor.hpp
@@ -22,6 +22,7 @@
 #pragma once
 
 #include "../util/util.hpp"
+#include "ngraph/src/nnfusion/common/type/data_buffer.hpp"
 
 namespace nnfusion
 {
@@ -55,50 +56,31 @@ namespace nnfusion
                     return detail::get_data<T>(*m_tensor_proto);
                 }
 
+                DataBuffer buffer_get_data() const
+                {
+                    return detail::buffer_get_data(*m_tensor_proto);
+                }
+
                 const std::string& get_name() const
                 {
                     NNFUSION_CHECK(m_tensor_proto->has_name()) << "tensor has no name specified.";
                     return m_tensor_proto->name();
                 }
 
-                const element::Type& get_ng_type() const
+                element::Type get_ng_type() const
                 {
                     NNFUSION_CHECK(m_tensor_proto->has_data_type())
                         << "tensor has no data type specified.";
 
-                    switch (m_tensor_proto->data_type())
-                    {
-                    case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
-                        return element::boolean;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
-                    case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
-                        return element::f32;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
-                        return element::f64;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_INT8: return element::i8;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
-                        return element::i16;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
-                        return element::i32;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
-                        return element::i64;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT8: return element::u8;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
-                        return element::u16;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
-                        return element::u32;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
-                        return element::u64;
-                    case onnx::TensorProto_DataType::TensorProto_DataType_UNDEFINED:
-                        NNFUSION_CHECK_FAIL() << "data type is not defined";
-                        break;
-                    default:
-                        NNFUSION_CHECK_FAIL()
-                            << "unsupported data type: "
-                            << onnx::TensorProto_DataType_Name(
-                                   onnx::TensorProto_DataType(m_tensor_proto->data_type()));
-                        break;
-                    }
+                    element::Type element_type;
+                    bool status;
+                    status = ONNXDataTypeToNNFusionElementType(
+                        static_cast<onnx::TensorProto_DataType>(m_tensor_proto->data_type()),
+                        &element_type);
+                    NNFUSION_CHECK(status) << "Data type not supported: "
+                                           << m_tensor_proto->data_type();
+
+                    return element_type;
                 }
 
                 operator onnx::TensorProto_DataType() const
diff --git a/src/nnfusion/frontend/onnx_import/util/util.cpp b/src/nnfusion/frontend/onnx_import/util/util.cpp
index d6f52653d..28a56620f 100644
--- a/src/nnfusion/frontend/onnx_import/util/util.cpp
+++ b/src/nnfusion/frontend/onnx_import/util/util.cpp
@@ -28,7 +28,7 @@ namespace nnfusion
     {
         namespace onnx_import
         {
-            bool ONNXDataTypeToNNFusionElementType(const onnx::TensorProto_DataType onnx_dt,
+            bool ONNXDataTypeToNNFusionElementType(onnx::TensorProto_DataType onnx_dt,
                                                    nnfusion::element::Type* nnfusion_et)
             {
                 switch (onnx_dt)
@@ -36,8 +36,10 @@ namespace nnfusion
                 case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
                     *nnfusion_et = element::boolean;
                     break;
-                case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
                 case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
+                    *nnfusion_et = element::f16;
+                    break;
+                case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
                     *nnfusion_et = element::f32;
                     break;
                 case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
@@ -86,35 +88,38 @@ namespace nnfusion
                                                            const Shape shape,
                                                            const Tensor& tensor)
             {
-                switch (onnx_et)
-                {
-                case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
-                    return make_constant_op<bool>(element::boolean, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
-                case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
-                    return make_constant_op<float>(element::f32, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
-                    return make_constant_op<double>(element::f64, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_INT8:
-                    return make_constant_op<int8_t>(element::i8, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
-                    return make_constant_op<int16_t>(element::i16, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
-                    return make_constant_op<int32_t>(element::i32, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
-                    return make_constant_op<int64_t>(element::i64, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_UINT8:
-                    return make_constant_op<uint8_t>(element::u8, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
-                    return make_constant_op<uint16_t>(element::u16, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
-                    return make_constant_op<uint32_t>(element::u32, shape, tensor);
-                case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
-                    return make_constant_op<uint64_t>(element::u64, shape, tensor);
-                default:
-                    NNFUSION_CHECK_FAIL() << "unsupported value info element type: "
-                                          << onnx::TensorProto_DataType_Name(onnx_et);
-                }
+                element::Type element_type = tensor.get_ng_type();
+                return std::make_shared<op::Constant>(
+                    element_type, shape, tensor.buffer_get_data());
+                // switch (onnx_et)
+                // {
+                // case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
+                //     return make_constant_op<bool>(element::boolean, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
+                // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
+                //     return make_constant_op<float>(element::f32, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
+                //     return make_constant_op<double>(element::f64, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_INT8:
+                //     return make_constant_op<int8_t>(element::i8, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
+                //     return make_constant_op<int16_t>(element::i16, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
+                //     return make_constant_op<int32_t>(element::i32, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
+                //     return make_constant_op<int64_t>(element::i64, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT8:
+                //     return make_constant_op<uint8_t>(element::u8, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
+                //     return make_constant_op<uint16_t>(element::u16, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
+                //     return make_constant_op<uint32_t>(element::u32, shape, tensor);
+                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
+                //     return make_constant_op<uint64_t>(element::u64, shape, tensor);
+                // default:
+                //     NNFUSION_CHECK_FAIL() << "unsupported value info element type: "
+                //                           << onnx::TensorProto_DataType_Name(onnx_et);
+                // }
             }
 
             std::shared_ptr<graph::GNode> GetInputNode(const NodeMap& all_ng_nodes,
@@ -280,6 +285,100 @@ namespace nnfusion
                     name, std::vector<std::size_t>(kernel_shape.size(), 1UL));
             }
 
+            // Builds a DataBuffer holding the contents of an ONNX TensorProto.
+            //
+            // The element type comes from tensor.data_type() and the element count is
+            // the product of tensor.dims(). If raw_data is present it is passed to
+            // DataBuffer::load as-is; otherwise the typed repeated field that matches
+            // the data type is copied element by element through DataBuffer::setElement.
+            //
+            // An NNFUSION_CHECK fails when the data type has no NNFusion element type,
+            // when the typed field's length differs from the element count, or when
+            // the data type has no typed-field case in the switch below.
+            DataBuffer detail::buffer_get_data(const onnx::TensorProto& tensor)
+            {
+                size_t n_element = 1;
+                element::Type type;
+                bool status;
+                auto onnx_dt = static_cast<onnx::TensorProto_DataType>(tensor.data_type());
+
+                status = ONNXDataTypeToNNFusionElementType(onnx_dt, &type);
+
+                NNFUSION_CHECK(status) << "Unsupported ONNX data_type " << tensor.data_type()
+                                       << " is found";
+
+                DataBuffer buf(type);
+
+                for (auto dim : tensor.dims())
+                {
+                    n_element *= dim;
+                }
+                buf.resize(n_element);
+
+                if (tensor.has_raw_data())
+                {
+                    buf.load(tensor.raw_data().data(), n_element);
+                }
+                else
+                {
+// GET_VALUE(pb_type, mid_type): converts each tensor.<pb_type>_data() entry to
+// mid_type with static_cast and hands a pointer to that temporary to buf.setElement.
+#define GET_VALUE(pb_type, mid_type)                                                               \
+    do                                                                                             \
+    {                                                                                              \
+        const void* dat;                                                                           \
+        mid_type m;                                                                                \
+        NNFUSION_CHECK(n_element == tensor.pb_type##_data_size())                                  \
+            << "Tensor shape is not the same with tensor data_size. (" << n_element                \
+            << " != " << tensor.pb_type##_data_size() << ")";                                      \
+        for (size_t i = 0; i < n_element; ++i)                                                     \
+        {                                                                                          \
+            m = static_cast<mid_type>(tensor.pb_type##_data()[i]);                                 \
+            dat = reinterpret_cast<const void*>(&m);                                               \
+            buf.setElement(i, dat);                                                                \
+        }                                                                                          \
+    } while (0)
+
+                    switch (onnx_dt)
+                    {
+                    case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
+                        // ONNX stores FLOAT16 values in int32_data as uint16 bit
+                        // patterns, so narrow to uint16_t to keep the bits; a cast to
+                        // element::half would presumably convert the integer's numeric
+                        // value instead. NOTE(review): assumes setElement copies the
+                        // 2-byte f16 element from the pointer it is given -- confirm.
+                        GET_VALUE(int32, uint16_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
+                        GET_VALUE(float, float);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
+                        GET_VALUE(double, double);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
+                        GET_VALUE(int32, int32_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
+                        GET_VALUE(int64, int64_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
+                        GET_VALUE(uint64, uint64_t);
+                        break;
+                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
+                    case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
+                    case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
+                    case onnx::TensorProto_DataType::TensorProto_DataType_INT8:
+                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT8:
+                    case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
+                    default:
+                        NNFUSION_CHECK_FAIL() << "unsupported onnx element type: "
+                                              << onnx::TensorProto_DataType_Name(onnx_dt);
+                    }
+#undef GET_VALUE
+                }
+                return buf;
+            }
+
         } // namespace onnx_import
     }     // namespace frontend
 } // namespace nnfusion
diff --git a/src/nnfusion/frontend/onnx_import/util/util.hpp b/src/nnfusion/frontend/onnx_import/util/util.hpp
index 871ab4801..31bf27726 100644
--- a/src/nnfusion/frontend/onnx_import/util/util.hpp
+++ b/src/nnfusion/frontend/onnx_import/util/util.hpp
@@ -28,6 +28,7 @@
 #include <vector>
 
 #include "../onnx_base.hpp"
+#include "ngraph/src/nnfusion/common/type/data_buffer.hpp"
 #include "nnfusion/common/common.hpp"
 
 namespace nnfusion
@@ -51,6 +52,8 @@ namespace nnfusion
                     return {it, it + (raw_data.size() / sizeof(T))};
                 }
 
+                DataBuffer buffer_get_data(const onnx::TensorProto& tensor);
+
                 template <typename T>
                 inline std::vector<T> get_data(const onnx::TensorProto& tensor)
                 {
@@ -186,7 +189,7 @@ namespace nnfusion
             class Tensor;
             class Node;
 
-            bool ONNXDataTypeToNNFusionElementType(const onnx::TensorProto_DataType onnx_dt,
+            bool ONNXDataTypeToNNFusionElementType(onnx::TensorProto_DataType onnx_dt,
                                                    nnfusion::element::Type* nnfusion_et);
 
             template <typename T>

From 9cefad53f66ed3c324c79a78cbbbfaf6c81facbf Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Thu, 17 Dec 2020 11:22:50 +0800
Subject: [PATCH 25/32] fix onnx fp16

---
 src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp | 4 ++--
 src/nnfusion/frontend/onnx_import/op/constant.hpp     | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp b/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp
index 76d40424d..42c782ae4 100644
--- a/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp
+++ b/src/nnfusion/engine/pass/graph/dot_transpose_pass.hpp
@@ -25,6 +25,6 @@ namespace nnfusion
             public:
                 bool run_on_graph(std::shared_ptr<nnfusion::graph::Graph>& graph) override;
             };
-        } // namespace pass
-    }     // namespace graph
+        } // namespace graph
+    }     // namespace pass
 } // namespace nnfusion
diff --git a/src/nnfusion/frontend/onnx_import/op/constant.hpp b/src/nnfusion/frontend/onnx_import/op/constant.hpp
index f3dd0bfc9..4280f9f86 100644
--- a/src/nnfusion/frontend/onnx_import/op/constant.hpp
+++ b/src/nnfusion/frontend/onnx_import/op/constant.hpp
@@ -65,8 +65,11 @@ namespace nnfusion
                     Node node(node_proto);
                     auto tensor = node.get_attribute_value<Tensor>("value");
 
-                    const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type());
-                    auto op = func_param(tensor.get_ng_type(), tensor);
+                    // const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type());
+                    // auto op = func_param(tensor.get_ng_type(), tensor);
+                    auto op = std::make_shared<op::Constant>(
+                        tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data()
+                    );
 
                     op->set_name(node_proto.output(0));
                     auto gnode = m_graph->add_node_and_edge(op, graph::GNodeVector({}));

From 6ded1628d9febdfdd8003d7b2ec11f992f3a2b27 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Tue, 29 Dec 2020 13:34:29 +0800
Subject: [PATCH 26/32] bert l1 runnable

---
 .../core/kernels/cuda_gpu/cuda_helper.cpp     |   5 +
 .../kernels/cuda_gpu/kernels/batch_matmul.cpp |  18 ++-
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 147 +++++++++---------
 .../core/kernels/cuda_gpu/kernels/reduce.hpp  |   2 +-
 .../core/kernels/cuda_gpu/kernels/softmax.cpp |   3 +-
 .../engine/pass/graph/kernel_tuning.cpp       |   6 +
 .../frontend/onnx_import/core/tensor.hpp      |   2 +-
 .../frontend/onnx_import/op/constant.hpp      |   3 +-
 .../frontend/onnx_import/util/util.hpp        |   2 +-
 9 files changed, 104 insertions(+), 84 deletions(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp b/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp
index c1809a5cb..54a73cfb1 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/cuda_helper.cpp
@@ -33,6 +33,11 @@ LanguageUnit_p cuda::get_math_kernel(const std::string& name,
         writer << ")\n";
         writer << "{\n";
         writer.indent++;
+        if (name == "convert" && data_types[num_inputs] == "half" && data_types[0] == "int64_t")
+        {
+            writer << "return (long long)" + math_kernel << ";\n";
+        }
+        else
         {
             writer << "return " + math_kernel << ";\n";
         }
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
index 173e95e93..c42d7780c 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/batch_matmul.cpp
@@ -8,6 +8,7 @@
 //   [a] ./new_kernel_0.cpp
 //   [b] ../../../ops/op_define/new_op_0.cpp
 
+#include <string>
 #include "../cuda_emitter.hpp"
 #include "../cuda_langunit.hpp"
 #include "nnfusion/core/operators/generic_op/generic_op.hpp"
@@ -52,6 +53,15 @@ namespace nnfusion
                     const nnfusion::Shape& input_shape_0 = m_context->inputs[0]->get_shape();
                     const nnfusion::Shape& input_shape_1 = m_context->inputs[1]->get_shape();
 
+                    element::Type dtype0 = m_context->inputs[0]->get_element_type();
+                    element::Type dtype1 = m_context->inputs[1]->get_element_type();
+                    element::Type dtype2 = m_context->outputs[0]->get_element_type();
+                    NNFUSION_CHECK(dtype0 == dtype1 && dtype1 == dtype2)
+                        << "Unsupported element type combination of (" << dtype0.c_type_string()
+                        << ", " << dtype1.c_type_string() << ") -> " << dtype2.c_type_string()
+                        << ".";
+                    element::Type& dtype = dtype0;
+
                     bool transA = generic_op->localOpConfig.getRoot()["adj_x"]["b"];
                     bool transB = generic_op->localOpConfig.getRoot()["adj_y"]["b"];
                     size_t A1 = 1LU;
@@ -92,10 +102,11 @@ namespace nnfusion
                         stride_b = A2 * A3, ldc = A4, stride_c = A2 * A4;
                     }
 
+                    std::string type = dtype.c_type_string();
                     float alpha = 1.0f, beta = 0.0f;
                     auto code = nnfusion::op::create_code_from_template(
                         R"(
-                        static const float alpha = @alpha@F, beta = @beta@F;
+                        static const @dtype@ alpha = @alpha@, beta = @beta@;
                         // if (!@hCublas@)
                         //     CUBLAS_SAFE_CALL(@api_create@(&@hCublas@));
                         CUBLAS_SAFE_CALL(@api_exec@(
@@ -106,7 +117,9 @@ namespace nnfusion
                         {
                             {"hCublas", "cublas_handle"},
                             {"api_create", "cublasCreate"},
-                            {"api_exec", "cublasSgemmStridedBatched"},
+                            {"api_exec",
+                             dtype == element::f32 ? "cublasSgemmStridedBatched"
+                                                   : "cublasHgemmStridedBatched"},
                             {"transA", transB ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
                             {"transB", transA ? "CUBLAS_OP_T" : "CUBLAS_OP_N"},
                             {"alpha", alpha},
@@ -121,6 +134,7 @@ namespace nnfusion
                             {"stride_b", stride_b},
                             {"stride_c", stride_c},
                             {"batch", A1},
+                            {"dtype", type},
                         });
 
                     LanguageUnit_p _lu(new LanguageUnit(get_function_name()));
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 2b3a1543d..8b4856c53 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -86,7 +86,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
         // matrix * vector
         else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
         {
-            lu << "const float alpha = 1.0;\n const float beta = 0;\n";
+            lu << "const float alpha = 1.0;\n const float beta = 0.;\n";
             lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
             if (trans_A)
                 lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
@@ -107,7 +107,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
             int n = trans_A ? arg0_shape[1] : arg0_shape[0];
             int k = trans_A ? arg0_shape[0] : arg0_shape[1];
 
-            lu << "const float alpha = 1.0;\nconst float beta = 0;\n";
+            lu << "const float alpha = 1.0;\nconst float beta = 0.;\n";
 
             lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle,"
                << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
@@ -186,7 +186,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                 }
             }
 
-            lu << "const float alpha = 1.0;\nconst float beta = 0;\n";
+            lu << "const float alpha = 1.0;\nconst float beta = 0.;\n";
 
             lu << "CUBLAS_SAFE_CALL(cublasSgemm(cublas_handle,"
                << " CUBLAS_OP_N,"
@@ -261,89 +261,84 @@ LanguageUnit_p cuda::Dot::emit_function_body()
         //     << " static_cast<float*>(output0),"
         //     << " 1));\n";
         // }
-        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
-        //         (trans_A || trans_B))
-        // {
-        //     int m = trans_B ? arg1_shape[0] : arg1_shape[1];
-        //     int n = trans_A ? arg0_shape[1] : arg0_shape[0];
-        //     int k = trans_A ? arg0_shape[0] : arg0_shape[1];
+        if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
+            (trans_A || trans_B))
+        {
+            int m = trans_B ? arg1_shape[0] : arg1_shape[1];
+            int n = trans_A ? arg0_shape[1] : arg0_shape[0];
+            int k = trans_A ? arg0_shape[0] : arg0_shape[1];
 
-        //     lu << "const half alpha = 1.0;\nconst half beta = 0;\n";
+            lu << "const half alpha = 1.0;\nconst half beta = 0.;\n";
 
-        //     lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-        //     << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
-        //     << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
-        //     << " " << n << ","
-        //     << " " << k << ","
-        //     << " &alpha,"
-        //     << " static_cast<const half*>(input1),"
-        //     << " " << arg1_shape[1] << ","
-        //     << " static_cast<const half*>(input0),"
-        //     << " " << arg0_shape[1] << ","
-        //     << " &beta,"
-        //     << " static_cast<half*>(output0),"
-        //     << " " << m << "));\n";
-        // } else {
-        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
-        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
-        size_t axes_for_k_count = reduction_axes;
-        size_t m = 1;
-        size_t n = 1;
-        size_t k = 1;
+            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+               << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
+               << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
+               << " " << n << ","
+               << " " << k << ","
+               << " &alpha,"
+               << " static_cast<const half*>(input1),"
+               << " " << arg1_shape[1] << ","
+               << " static_cast<const half*>(input0),"
+               << " " << arg0_shape[1] << ","
+               << " &beta,"
+               << " static_cast<half*>(output0),"
+               << " " << m << "));\n";
+        }
+        else
+        {
+            size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
+            size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
+            size_t axes_for_k_count = reduction_axes;
+            size_t m = 1;
+            size_t n = 1;
+            size_t k = 1;
 
-        // check if input and output size correct
-        // check and calculate k for arg0 and arg1
-        size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
-        size_t arg1_k_idx = 0;                // first axe in arg1 for k
+            // check if input and output size correct
+            // check and calculate k for arg0 and arg1
+            size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
+            size_t arg1_k_idx = 0;                // first axe in arg1 for k
 
-        for (size_t i = 0; i < axes_for_k_count; i++)
-        {
-            k *= arg0_shape[arg0_k_idx];
-            if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+            for (size_t i = 0; i < axes_for_k_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg0", "arg1"};
-                std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
+                k *= arg0_shape[arg0_k_idx];
+                if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg0", "arg1"};
+                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                      << nnfusion::join(shape_vec) << " respectively, at Node "
-                                      << m_context->gnode->get_name()
-                                      << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                          << nnfusion::join(shape_vec) << " respectively, at Node "
+                                          << m_context->gnode->get_name()
+                                          << ", do not match for dot op";
+                }
             }
-        }
-        // check and calculate m for arg0 and out
-        size_t arg0_m_idx = 0; // first axe in arg0 for m
-        size_t out_m_idx = 0;  // first axe in out for m
-        for (size_t i = 0; i < axes_for_m_count; i++)
-        {
-            m *= arg0_shape[arg0_m_idx];
-            if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+            // check and calculate m for arg0 and out
+            size_t arg0_m_idx = 0; // first axe in arg0 for m
+            size_t out_m_idx = 0;  // first axe in out for m
+            for (size_t i = 0; i < axes_for_m_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg0", "output"};
-                std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
+                m *= arg0_shape[arg0_m_idx];
+                if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg0", "output"};
+                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
 
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                      << nnfusion::join(shape_vec) << " respectively, at Node "
-                                      << m_context->gnode->get_name()
-                                      << ", do not match for dot op";
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                          << nnfusion::join(shape_vec) << " respectively, at Node "
+                                          << m_context->gnode->get_name()
+                                          << ", do not match for dot op";
+                }
             }
-        }
-        // check and calculate n for arg1 and out
-        size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
-        size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
-        for (size_t i = 0; i < axes_for_n_count; i++)
-        {
-            n *= arg1_shape[arg1_n_idx];
-            if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+            // check and calculate n for arg1 and out
+            size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
+            size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
+            for (size_t i = 0; i < axes_for_n_count; i++)
             {
-                std::vector<std::string> arg_vec{"arg1", "output"};
-                std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
-
-                NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                      << nnfusion::join(shape_vec) << " respectively, at Node "
-                                      << m_context->gnode->get_name()
-                                      << ", do not match for dot op";
-            }
-        }
+                n *= arg1_shape[arg1_n_idx];
+                if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
+                {
+                    std::vector<std::string> arg_vec{"arg1", "output"};
+                    std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
 
         lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
 
@@ -527,7 +522,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
     }
     else
     {
-        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot.";
+        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for kernel dot.";
     }
     //lu.block_end();
     return _lu;
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp
index 97353e5e8..c9bfb3c26 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/reduce.hpp
@@ -189,7 +189,7 @@ int data_idx_offset = block_idx * width;
 float val = 0.0;
 for (int tidx = thread_idx; tidx < width; tidx += block_size) {
     int data_idx = tidx + data_idx_offset;
-    val += input0[data_idx];
+    val += static_cast<float>(input0[data_idx]);
 }
 val = reduceSum(val, thread_idx, block_size, shm);
 if (thread_idx == 0) output0[block_idx] = val;
diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
index 4f5bfa067..b83e9a832 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/softmax.cpp
@@ -203,7 +203,8 @@ LanguageUnit_p
 {
     LanguageUnit_p _lu(new LanguageUnit);
     auto& lu = *_lu;
-    string data_type = "CUDNN_DATA_FLOAT"; //cuda::get_cudnn_datatype(type);
+    element::Type type = m_context->inputs[0]->get_element_type();
+    string data_type = cuda::get_cudnn_datatype(type);
     string tensor_format = "CUDNN_TENSOR_NCHW";
     lu << "cudnnTensorDescriptor_t " << desc << ";\n";
     lu << "CUDNN_SAFE_CALL(cudnnCreateTensorDescriptor(&" << desc << "));\n";
diff --git a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
index 068f7c904..751c5dc40 100644
--- a/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
+++ b/src/nnfusion/engine/pass/graph/kernel_tuning.cpp
@@ -91,6 +91,12 @@ void print_tuning_results(std::vector<std::shared_ptr<TuningStatus>> tuned_kerne
            << std::setw(10) << s->status << " | " << std::setw(6) << s->progress_step << "/"
            << FLAGS_fkernel_tuning_steps << " "
            << " | " << std::setw(12) << s->best_perf << " ms |\n";
+
+        if (fabs(s->best_perf + 1.0) < 1e-5)
+        {
+            NNFUSION_LOG(INFO) << "Kernel named \"" << s->op_name << "\" has not yet been tuned.\n"
+                               << s->ir;
+        }
     }
     NNFUSION_LOG(INFO) << ss.str();
 }
diff --git a/src/nnfusion/frontend/onnx_import/core/tensor.hpp b/src/nnfusion/frontend/onnx_import/core/tensor.hpp
index 85a624b14..201aa580d 100644
--- a/src/nnfusion/frontend/onnx_import/core/tensor.hpp
+++ b/src/nnfusion/frontend/onnx_import/core/tensor.hpp
@@ -22,7 +22,7 @@
 #pragma once
 
 #include "../util/util.hpp"
-#include "ngraph/src/nnfusion/common/type/data_buffer.hpp"
+#include "nnfusion/common/type/data_buffer.hpp"
 
 namespace nnfusion
 {
diff --git a/src/nnfusion/frontend/onnx_import/op/constant.hpp b/src/nnfusion/frontend/onnx_import/op/constant.hpp
index 4280f9f86..87163617d 100644
--- a/src/nnfusion/frontend/onnx_import/op/constant.hpp
+++ b/src/nnfusion/frontend/onnx_import/op/constant.hpp
@@ -68,8 +68,7 @@ namespace nnfusion
                     // const auto& func_param = ONNX_CONST_MAP().at(tensor.get_ng_type());
                     // auto op = func_param(tensor.get_ng_type(), tensor);
                     auto op = std::make_shared<op::Constant>(
-                        tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data()
-                    );
+                        tensor.get_ng_type(), tensor.get_shape(), tensor.buffer_get_data());
 
                     op->set_name(node_proto.output(0));
                     auto gnode = m_graph->add_node_and_edge(op, graph::GNodeVector({}));
diff --git a/src/nnfusion/frontend/onnx_import/util/util.hpp b/src/nnfusion/frontend/onnx_import/util/util.hpp
index 31bf27726..2f7262842 100644
--- a/src/nnfusion/frontend/onnx_import/util/util.hpp
+++ b/src/nnfusion/frontend/onnx_import/util/util.hpp
@@ -28,8 +28,8 @@
 #include <vector>
 
 #include "../onnx_base.hpp"
-#include "ngraph/src/nnfusion/common/type/data_buffer.hpp"
 #include "nnfusion/common/common.hpp"
+#include "nnfusion/common/type/data_buffer.hpp"
 
 namespace nnfusion
 {

From 66abc20a97357bf7c3e9ba78e2327c467a0b06da Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Tue, 5 Jan 2021 17:07:37 +0800
Subject: [PATCH 27/32] change priority of evaluator runtime

---
 src/nnfusion/frontend/util/evaluator.hpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/nnfusion/frontend/util/evaluator.hpp b/src/nnfusion/frontend/util/evaluator.hpp
index 46b257922..23d9bc7b8 100644
--- a/src/nnfusion/frontend/util/evaluator.hpp
+++ b/src/nnfusion/frontend/util/evaluator.hpp
@@ -105,21 +105,21 @@ namespace nnfusion
                 nnfusion::profiler::IProfilingRuntime::Pointer runtime = nullptr;
                 std::vector<shared_ptr<const KernelRegistration>> kernel_regs;
 
-                runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime();
+                runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime();
                 if (runtime->check_env())
                 {
                     kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                        gnode->get_op_type(), ROCM_GPU, element::f32);
-                    if (kernel_regs.size() == 0)
-                        kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                            gnode->get_op_type(), CUDA_GPU, element::f32);
+                        gnode->get_op_type(), CUDA_GPU, element::f32);
                 }
                 else
                 {
-                    runtime = nnfusion::profiler::CudaDefaultRuntime::Runtime();
+                    runtime = nnfusion::profiler::RocmDefaultRuntime::Runtime();
                     NNFUSION_CHECK(runtime->check_env());
                     kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
-                        gnode->get_op_type(), CUDA_GPU, element::f32);
+                        gnode->get_op_type(), ROCM_GPU, element::f32);
+                    if (kernel_regs.size() == 0)
+                        kernel_regs = KernelRegistry::Global()->FindKernelRegistrations(
+                            gnode->get_op_type(), CUDA_GPU, element::f32);
                 }
 
                 bool const_infer_success = false;

From f4598b7651ace0561ebdad4aa3ee1da9f5e5f54e Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 10 Mar 2021 15:08:56 +0800
Subject: [PATCH 28/32] delete comments

---
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 254 ---------
 .../pass/graph/codegen_dxcompute_pass.hpp     | 533 ------------------
 .../frontend/onnx_import/util/util.cpp        |  29 -
 3 files changed, 816 deletions(-)
 delete mode 100644 src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp

diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index b9d90dbbd..712da73c5 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -206,61 +206,6 @@ LanguageUnit_p cuda::Dot::emit_function_body()
     }
     else if (dtype == element::f16)
     {
-        // case 1: Scalar * Tensor
-        // if (arg0_shape.empty() || arg1_shape.empty())
-        // {
-        //     auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape);
-        //     size_t count = nnfusion::shape_size(second);
-
-        //     string firstarg = (arg0_shape.empty() ? "input1" : "input0");
-        //     string secondarg = (arg0_shape.empty() ? "input0" : "input1");
-
-        //     lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
-
-        //     lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n";     // copy `firstarg` to `output0`
-        //     lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n";
-        // }
-        // // case 2: 1d Dot
-        // else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes))
-        // {
-        //     for (int i = 0; i < arg0_shape.size(); i++)
-        //     {
-        //         if (arg0_shape[i] != arg1_shape[i])
-        //         {
-        //             std::vector<std::string> arg_vec{"arg0", "arg1"};
-        //             std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
-
-        //             NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-        //                                 << nnfusion::join(shape_vec) << " respectively, at Node "
-        //                                 << m_context->gnode->get_name()
-        //                                 << ", do not match for dot op";
-        //         }
-        //     }
-
-        //     size_t count = nnfusion::shape_size(arg0_shape);
-        //     lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
-
-        //     lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count
-        //     << ", static_cast<const float*>(input0), 1, static_cast<const float*>(input1), 1, "
-        //         "static_cast<float*>(output0)));\n";
-        // }
-        // // matrix * vector
-        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
-        // {
-        //     lu << "const float alpha = 1.0;\n const float beta = 0;\n";
-        //     lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
-        //     if (trans_A)
-        //         lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
-        //     else
-        //         lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", ";
-        //     lu << " &alpha,"
-        //     << " static_cast<const float*>(input0)," << arg0_shape[1] << ", "
-        //     << " static_cast<const float*>(input1),"
-        //     << " 1,"
-        //     << " &beta,"
-        //     << " static_cast<float*>(output0),"
-        //     << " 1));\n";
-        // }
         if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
             (trans_A || trans_B))
         {
@@ -361,205 +306,6 @@ LanguageUnit_p cuda::Dot::emit_function_body()
     } else {
         NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."
     }
-    else if (dtype == element::f16)
-    {
-        // case 1: Scalar * Tensor
-        // if (arg0_shape.empty() || arg1_shape.empty())
-        // {
-        //     auto& second = (arg0_shape.empty() ? arg1_shape : arg0_shape);
-        //     size_t count = nnfusion::shape_size(second);
-
-        //     string firstarg = (arg0_shape.empty() ? "input1" : "input0");
-        //     string secondarg = (arg0_shape.empty() ? "input0" : "input1");
-
-        //     lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
-
-        //     lu << "CUDA_SAFE_CALL(cudaMemcpy(outupt0, " << firstarg << ", " << count << ", cudaMemcpyDeviceToDevice));\n";     // copy `firstarg` to `output0`
-        //     lu << "CUBLAS_SAFE_CALL(nnfusionHalfScale(" << secondarg << ", output0, " << count << "));\n";
-        // }
-        // // case 2: 1d Dot
-        // else if ((arg0_shape.size() == arg1_shape.size()) && (arg0_shape.size() == reduction_axes))
-        // {
-        //     for (int i = 0; i < arg0_shape.size(); i++)
-        //     {
-        //         if (arg0_shape[i] != arg1_shape[i])
-        //         {
-        //             std::vector<std::string> arg_vec{"arg0", "arg1"};
-        //             std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
-
-        //             NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-        //                                 << nnfusion::join(shape_vec) << " respectively, at Node "
-        //                                 << m_context->gnode->get_name()
-        //                                 << ", do not match for dot op";
-        //         }
-        //     }
-
-        //     size_t count = nnfusion::shape_size(arg0_shape);
-        //     lu << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
-
-        //     lu << "CUBLAS_SAFE_CALL(cublasSdot(cublas_handle, " << count
-        //     << ", static_cast<const float*>(input0), 1, static_cast<const float*>(input1), 1, "
-        //         "static_cast<float*>(output0)));\n";
-        // }
-        // // matrix * vector
-        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) && (reduction_axes == 1))
-        // {
-        //     lu << "const float alpha = 1.0;\n const float beta = 0;\n";
-        //     lu << "CUBLAS_SAFE_CALL(cublasSgemv(cublas_handle, ";
-        //     if (trans_A)
-        //         lu << "CUBLAS_OP_N, " << arg0_shape[0] << ", " << arg0_shape[1] << ", ";
-        //     else
-        //         lu << "CUBLAS_OP_T, " << arg0_shape[1] << ", " << arg0_shape[0] << ", ";
-        //     lu << " &alpha,"
-        //     << " static_cast<const float*>(input0)," << arg0_shape[1] << ", "
-        //     << " static_cast<const float*>(input1),"
-        //     << " 1,"
-        //     << " &beta,"
-        //     << " static_cast<float*>(output0),"
-        //     << " 1));\n";
-        // }
-        // else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 2) && (reduction_axes == 1) &&
-        //         (trans_A || trans_B))
-        // {
-        //     int m = trans_B ? arg1_shape[0] : arg1_shape[1];
-        //     int n = trans_A ? arg0_shape[1] : arg0_shape[0];
-        //     int k = trans_A ? arg0_shape[0] : arg0_shape[1];
-
-        //     lu << "const half alpha = 1.0;\nconst half beta = 0;\n";
-
-        //     lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-        //     << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
-        //     << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
-        //     << " " << n << ","
-        //     << " " << k << ","
-        //     << " &alpha,"
-        //     << " static_cast<const half*>(input1),"
-        //     << " " << arg1_shape[1] << ","
-        //     << " static_cast<const half*>(input0),"
-        //     << " " << arg0_shape[1] << ","
-        //     << " &beta,"
-        //     << " static_cast<half*>(output0),"
-        //     << " " << m << "));\n";
-        // } else {
-        size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
-        size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
-        size_t axes_for_k_count = reduction_axes;
-        size_t m = 1;
-        size_t n = 1;
-        size_t k = 1;
-
-        // check if input and output size correct
-        // check and calculate k for arg0 and arg1
-        size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
-        size_t arg1_k_idx = 0;                // first axe in arg1 for k
-
-        for (size_t i = 0; i < axes_for_k_count; i++)
-        {
-            int m = trans_B ? arg1_shape[0] : arg1_shape[1];
-            int n = trans_A ? arg0_shape[1] : arg0_shape[0];
-            int k = trans_A ? arg0_shape[0] : arg0_shape[1];
-
-            lu << "const half alpha = 1.0;\nconst half beta = 0.;\n";
-
-            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-               << (trans_B ? " CUBLAS_OP_T," : " CUBLAS_OP_N,")
-               << (trans_A ? " CUBLAS_OP_T," : " CUBLAS_OP_N,") << " " << m << ","
-               << " " << n << ","
-               << " " << k << ","
-               << " &alpha,"
-               << " static_cast<const half*>(input1),"
-               << " " << arg1_shape[1] << ","
-               << " static_cast<const half*>(input0),"
-               << " " << arg0_shape[1] << ","
-               << " &beta,"
-               << " static_cast<half*>(output0),"
-               << " " << m << "));\n";
-        }
-        else
-        {
-            size_t axes_for_m_count = arg0_shape.size() - reduction_axes;
-            size_t axes_for_n_count = arg1_shape.size() - reduction_axes;
-            size_t axes_for_k_count = reduction_axes;
-            size_t m = 1;
-            size_t n = 1;
-            size_t k = 1;
-
-            // check if input and output size correct
-            // check and calculate k for arg0 and arg1
-            size_t arg0_k_idx = axes_for_m_count; // first axe in arg0 for k
-            size_t arg1_k_idx = 0;                // first axe in arg1 for k
-
-            for (size_t i = 0; i < axes_for_k_count; i++)
-            {
-                k *= arg0_shape[arg0_k_idx];
-                if (arg0_shape[arg0_k_idx++] != arg1_shape[arg1_k_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg0", "arg1"};
-                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, arg1_shape};
-
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                          << nnfusion::join(shape_vec) << " respectively, at Node "
-                                          << m_context->gnode->get_name()
-                                          << ", do not match for dot op";
-                }
-            }
-            // check and calculate m for arg0 and out
-            size_t arg0_m_idx = 0; // first axe in arg0 for m
-            size_t out_m_idx = 0;  // first axe in out for m
-            for (size_t i = 0; i < axes_for_m_count; i++)
-            {
-                m *= arg0_shape[arg0_m_idx];
-                if (arg0_shape[arg0_m_idx++] != out_shape[out_m_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg0", "output"};
-                    std::vector<nnfusion::Shape> shape_vec{arg0_shape, out_shape};
-
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                          << nnfusion::join(shape_vec) << " respectively, at Node "
-                                          << m_context->gnode->get_name()
-                                          << ", do not match for dot op";
-                }
-            }
-            // check and calculate n for arg1 and out
-            size_t arg1_n_idx = axes_for_k_count; // first axe in arg1 for n
-            size_t out_n_idx = axes_for_m_count;  // first axe in arg1 for n
-            for (size_t i = 0; i < axes_for_n_count; i++)
-            {
-                n *= arg1_shape[arg1_n_idx];
-                if (arg1_shape[arg1_n_idx++] != out_shape[out_n_idx++])
-                {
-                    std::vector<std::string> arg_vec{"arg1", "output"};
-                    std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
-
-                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
-                                          << nnfusion::join(shape_vec) << " respectively, at Node "
-                                          << m_context->gnode->get_name()
-                                          << ", do not match for dot op";
-                }
-            }
-
-            lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
-
-            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-               << " CUBLAS_OP_N,"
-               << " CUBLAS_OP_N,"
-               << " " << n << ","
-               << " " << m << ","
-               << " " << k << ","
-               << " &alpha,"
-               << " static_cast<const half*>(input1),"
-               << " " << n << ","
-               << " static_cast<const half*>(input0),"
-               << " " << k << ","
-               << " &beta,"
-               << " static_cast<half*>(output0),"
-               << " " << n << "));\n";
-        }
-    }
-    else
-    {
-        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for kernel dot.";
-    }
     //lu.block_end();
     return _lu;
 }
diff --git a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp b/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp
deleted file mode 100644
index 1779ad827..000000000
--- a/src/nnfusion/engine/pass/graph/codegen_dxcompute_pass.hpp
+++ /dev/null
@@ -1,533 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include "graph_pass_base.hpp"
-#include "nnfusion/core/operators/generic_op/generic_op.hpp"
-#include "nnfusion/core/operators/op_define/constant.hpp"
-#include "nnfusion/engine/profiler/profiler.hpp"
-#include "nnfusion/util/curl_request.hpp"
-
-using namespace nnfusion::graph;
-
-DECLARE_string(fdefault_device);
-DECLARE_string(fantares_codegen_server);
-
-namespace nnfusion
-{
-    namespace pass
-    {
-        namespace graph
-        {
-            class DirectComputeCodegenPass : public GraphPassBase
-            {
-                std::string currentBackend;
-                std::string autogen(const std::string& expr)
-                {
-                    if (FLAGS_fantares_codegen_server == "")
-                        FLAGS_fantares_codegen_server = "10.150.145.98:8884";
-                    static std::unordered_map<std::string, std::string> code_cache;
-                    std::string response;
-                    auto it = code_cache.find(expr);
-                    if (it == code_cache.end())
-                    {
-                        CurlRequest req(FLAGS_fantares_codegen_server);
-                        req.add_custom_header(("COMPUTE_V1: " + expr).c_str());
-                        req.add_custom_header("ARGS: ");
-
-                        printf("[Autogen] %s\n", expr.c_str());
-                        NNFUSION_CHECK(true == req.send_request(response));
-                        NNFUSION_CHECK(strncmp(response.c_str(), "[ERROR]", 7) != 0) << expr << "\n"
-                                                                                     << response;
-                        code_cache[expr] = response;
-                        return std::move(response);
-                    }
-                    else
-                        return it->second;
-                }
-
-                template <class T1, class T2>
-                inline std::string
-                    join_collections(const T1& vect, T2 func, bool skip_empty = false)
-                {
-                    std::stringstream result;
-                    int idx = 0;
-                    for (auto& it : vect)
-                    {
-                        auto str = func(idx, it);
-                        if (!str.size() && skip_empty)
-                            continue;
-                        if (idx > 0)
-                            result << ", ";
-                        result << str;
-                        ++idx;
-                    }
-                    return result.str();
-                }
-
-                // inline int get_type_id(nnfusion::element::Type type)
-                // {
-                //     // TODO: fill more type cases
-                //     if (type == nnfusion::element::f32)
-                //         return DT_FLOAT;
-                //     throw std::runtime_error("Not supported element type.");
-                // }
-
-                template <class T>
-                inline std::shared_ptr<T> get_op_object(std::shared_ptr<GNode>& curr)
-                {
-                    auto _op = static_pointer_cast<T>(curr->get_op_ptr());
-                    NNFUSION_CHECK_NOT_NULLPTR(_op) << "Node type is not "
-                                                    << curr->get_op_ptr()->get_op_type();
-                    return _op;
-                }
-
-                inline void UNHANDLED_CASE(std::shared_ptr<GNode>& curr)
-                {
-                    printf("## Unhandled case for %s:\n",
-                           curr->get_op_ptr()->get_op_type().c_str());
-                    for (int i = 0; i < curr->get_input_size(); ++i)
-                        printf(">> in-%d : %s\n",
-                               i,
-                               vector_to_string(curr->get_input_shape(i)).c_str());
-                    for (int i = 0; i < curr->get_output_size(); ++i)
-                        printf(">> out-%d: %s\n",
-                               i,
-                               vector_to_string(curr->get_output_shape(i)).c_str());
-                    exit(1);
-                };
-
-            public:
-                bool run_on_graph(std::shared_ptr<Graph>& graph) override
-                {
-                    currentBackend = "dxcompute";
-
-                    NNFUSION_LOG(INFO) << "Codegen for " << currentBackend << " starts up.";
-
-                    auto nodes = graph->get_nodes();
-                    std::unordered_map<std::shared_ptr<GNode>, int> din, dout;
-
-                    // Count degrees
-                    for (auto& it : nodes)
-                    {
-                        for (auto& in_edge : it->get_in_edges())
-                        {
-                            if (in_edge->is_control_edge())
-                                continue;
-                            NNFUSION_CHECK(in_edge->get_dst() == it);
-                            din[it]++;
-                            dout[in_edge->get_src()]++;
-                        }
-                    }
-
-                    // Name nodes, legality checks
-                    std::unordered_set<std::shared_ptr<GNode>> visited, vis_pend, blacklist;
-                    std::unordered_set<std::string> name_used;
-                    std::unordered_map<std::shared_ptr<GNode>, std::string> arg_names;
-                    for (auto& it : nodes)
-                    {
-                        NNFUSION_CHECK(it.get() != nullptr);
-
-                        auto arg_name = "Z0_" + it->get_op_ptr()->get_op_type() + "_" +
-                                        it->get_op_ptr()->get_name();
-                        for (auto& c : arg_name)
-                            if (!isalpha(c) && !isdigit(c))
-                                c = '_';
-                        if (name_used.count(arg_name))
-                        {
-                            for (int i = 1;; ++i)
-                            {
-                                auto alter = arg_name + "_" + std::to_string(i);
-                                if (!name_used.count(alter))
-                                {
-                                    arg_name = alter;
-                                    break;
-                                }
-                            }
-                        }
-                        name_used.insert(arg_name);
-                        arg_names[it] = arg_name;
-
-                        if (din[it] == 0 && dout[it] == 0)
-                            visited.insert(it), blacklist.insert(it);
-                        NNFUSION_CHECK(it->get_output_size() == 1);
-                    }
-                    NNFUSION_LOG(INFO) << "There are " << blacklist.size()
-                                       << " standalone GNode(s) found.";
-                    name_used.clear();
-
-                    // Fill offsetup nodes
-                    std::deque<std::shared_ptr<GNode>> gen_q, pend_q;
-                    for (auto& it : nodes)
-                    {
-                        if (visited.count(it))
-                            continue;
-                        if (din[it] == 0)
-                        {
-                            gen_q.push_back(it);
-                        }
-                    }
-
-                    NNFUSION_CHECK(
-                        0 ==
-                        system(("mkdir -p nnfusion_rt/" + currentBackend + "_codegen").c_str()));
-
-                    std::ofstream fout("nnfusion_rt/" + currentBackend + "_codegen/nnfusion_rt.h");
-
-                    fout << "#if 1\n\n";
-                    // Perform blockfusion
-                    int offset = 0, step = 0;
-                    auto new_super_step = [&]() {
-                        while (pend_q.size())
-                        {
-                            gen_q.push_back(pend_q.front());
-                            pend_q.pop_front();
-                        }
-                        if (offset > 0)
-                            ++step, offset = 0;
-                    };
-
-                    auto print_standard_codegen = [&](std::shared_ptr<GNode>& curr,
-                                                      std::ofstream& fout,
-                                                      std::string ir,
-                                                      std::string options) {
-                        std::string code = autogen(ir);
-
-                        if (options.size() > 0)
-                        {
-                            if (options[0] != '|')
-                                options = "|" + options;
-                            if (options.back() != '|')
-                                options += "|";
-                        }
-
-                        if (int(options.find("|memcpy|")) >= 0)
-                        {
-                            NNFUSION_CHECK(curr->get_input_size() == 1);
-                            fout << "NNfusionTensor &" << arg_names[curr] << " = "
-                                 << arg_names[curr->get_in_edge(0)->get_src()] << ";\n";
-                            return;
-                        }
-
-                        static std::unordered_map<std::string, std::string> dedupe_kernels;
-                        auto kernel = dedupe_kernels.find(code);
-                        if (kernel == dedupe_kernels.end())
-                        {
-                            NNFUSION_CHECK(0 == system(("mkdir -p nnfusion_rt/" + currentBackend +
-                                                        "_codegen/HLSL")
-                                                           .c_str()));
-                            FILE* fp = fopen(("nnfusion_rt/" + currentBackend + "_codegen/HLSL/" +
-                                              arg_names[curr] + ".hlsl")
-                                                 .c_str(),
-                                             "wb");
-                            NNFUSION_CHECK(fp != nullptr);
-                            NNFUSION_CHECK(code.size() == fwrite(code.c_str(), 1, code.size(), fp));
-                            fclose(fp);
-                            dedupe_kernels[code] = arg_names[curr];
-                            kernel = dedupe_kernels.find(code);
-                        }
-
-                        fout << "// " << ir << "\n";
-                        if (int(options.find("|inplace_wg|")) < 0)
-                        {
-                            fout << "NNfusionTensor " << arg_names[curr] << "(device, {"
-                                 << join_collections(
-                                        curr->get_output_shape(0),
-                                        [](int idx, ssize_t it) { return std::to_string(it); })
-                                 << "}, sizeof(" << curr->get_output_element_type(0).c_type_string()
-                                 << "));\n";
-
-                            fout << "  NNfusionOperator op_" << arg_names[curr] << "(device, {";
-                            for (int i = 0; i < curr->get_input_size(); ++i)
-                            {
-                                if (i)
-                                    fout << ", ";
-                                fout << arg_names[curr->get_in_edge(i)->get_src()];
-                            }
-                            fout << "}, { " << arg_names[curr] << " }, L\"" << kernel->second
-                                 << ".hlsl\");";
-                        }
-                        else
-                        {
-                            fout << "  NNfusionOperator op_" << arg_names[curr] << "(device, {";
-                            for (int i = 0; i < curr->get_input_size(); ++i)
-                            {
-                                if (i)
-                                    fout << ", ";
-                                fout << arg_names[curr->get_in_edge(i)->get_src()];
-                            }
-                            fout << "}, { " << arg_names[curr->get_in_edge(0)->get_src()]
-                                 << " }, L\"" << kernel->second << ".hlsl\");\n";
-                            fout << "auto& " << arg_names[curr] << " = "
-                                 << arg_names[curr->get_in_edge(0)->get_src()] << ";";
-                        }
-                    };
-
-                    auto codegen_for_elementwise = [&](std::shared_ptr<GNode>& curr,
-                                                       std::ofstream& fout,
-                                                       const std::string& topi,
-                                                       const std::string& options = "") {
-                        std::string expr = " -";
-                        for (int i = 0; i < curr->get_input_size(); ++i)
-                            expr += " input(\"input" + std::to_string(i) + "\", @common_shape@);";
-                        expr += " output(@common_shape@, " + topi + ");";
-
-                        int num_elements = 1, y;
-                        for (auto& it : curr->get_input_shape(0))
-                            num_elements *= it;
-
-                        print_standard_codegen(
-                            curr,
-                            fout,
-                            op::create_code_from_template(
-                                expr,
-                                {{"common_shape", "[ " + std::to_string(num_elements) + " ]"}}),
-                            options);
-                    };
-
-                    std::unordered_map<std::string,
-                                       std::function<void(std::shared_ptr<GNode>&, std::ofstream&)>>
-                        kernel_dict;
-
-                    // Elementwise Ops
-                    kernel_dict["Subtract"] = [&](std::shared_ptr<GNode>& curr,
-                                                  std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.subtract(args(\"input0\"), args(\"input1\"))");
-                    };
-                    kernel_dict["Multiply"] = [&](std::shared_ptr<GNode>& curr,
-                                                  std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.multiply(args(\"input0\"), args(\"input1\"))");
-                    };
-                    kernel_dict["Divide"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.divide(args(\"input0\"), args(\"input1\"))");
-                    };
-                    kernel_dict["DivNoNan"] = [&](std::shared_ptr<GNode>& curr,
-                                                  std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr,
-                            fout,
-                            "lambda x: tvm.te.if_then_else(args(\"input1\")[x] != "
-                            "0, args(\"input0\")[x] / args(\"input1\")[x], 0)");
-                    };
-                    kernel_dict["Power"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.power(args(\"input0\"), args(\"input1\"))");
-                    };
-                    kernel_dict["LessEq"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.less_equal(args(\"input0\"), args(\"input1\"))");
-                    };
-                    kernel_dict["Equal"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.equal(args(\"input0\"), args(\"input1\"))");
-                    };
-                    kernel_dict["Maximum"] = [&](std::shared_ptr<GNode>& curr,
-                                                 std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.maximum(args(\"input0\"), args(\"input1\"))");
-                    };
-                    kernel_dict["Exp"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(curr, fout, "topi=topi.exp(args(\"input0\"))");
-                    };
-                    kernel_dict["Negative"] = [&](std::shared_ptr<GNode>& curr,
-                                                  std::ofstream& fout) {
-                        codegen_for_elementwise(curr, fout, "topi=topi.negative(args(\"input0\"))");
-                    };
-                    kernel_dict["Tanh"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(curr, fout, "topi=topi.tanh(args(\"input0\"))");
-                    };
-                    kernel_dict["Relu6"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.clip(args(\"input0\"), 0, 6)");
-                    };
-                    kernel_dict["Sigmoid"] = [&](std::shared_ptr<GNode>& curr,
-                                                 std::ofstream& fout) {
-                        codegen_for_elementwise(curr, fout, "topi=topi.sigmoid(args(\"input0\"))");
-                    };
-                    kernel_dict["Square"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr, fout, "topi=topi.multiply(args(\"input0\"), args(\"input0\"))");
-                    };
-                    kernel_dict["Rsqrt"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(curr, fout, "topi=topi.rsqrt(args(\"input0\"))");
-                    };
-                    kernel_dict["Log"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(curr, fout, "topi=topi.log(args(\"input0\"))");
-                    };
-                    kernel_dict["ReluBackprop"] = [&](std::shared_ptr<GNode>& curr,
-                                                      std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr,
-                            fout,
-                            "lambda x: tvm.te.if_then_else(args(\"input0\")[x] > "
-                            "0, args(\"input1\")[x], 0)");
-                    };
-                    kernel_dict["Select"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        codegen_for_elementwise(
-                            curr,
-                            fout,
-                            "lambda x: tvm.te.if_then_else(args(\"input0\")[x] == "
-                            "0, args(\"input2\")[x], args(\"input1\")[x])");
-                    };
-
-                    // Non-standard Ops
-                    kernel_dict["Constant"] = [&](std::shared_ptr<GNode>& curr,
-                                                  std::ofstream& fout) {
-                        auto p_const = std::dynamic_pointer_cast<op::Constant>(curr->get_op_ptr());
-                        NNFUSION_CHECK(p_const != nullptr);
-                        const void* dptr = p_const->get_data_ptr();
-                        size_t size = p_const->get_data_size();
-
-                        NNFUSION_CHECK(0 == system(("mkdir -p nnfusion_rt/" + currentBackend +
-                                                    "_codegen/Constant")
-                                                       .c_str()));
-                        FILE* fp = fopen(("nnfusion_rt/" + currentBackend + "_codegen/Constant/" +
-                                          arg_names[curr])
-                                             .c_str(),
-                                         "wb");
-                        NNFUSION_CHECK(fp != nullptr);
-                        NNFUSION_CHECK(size == fwrite(dptr, 1, size, fp));
-                        fclose(fp);
-
-                        fout << "NNfusionTensor " << arg_names[curr] << "(device, {"
-                             << join_collections(
-                                    curr->get_output_shape(0),
-                                    [](int idx, ssize_t it) { return std::to_string(it); })
-                             << "}, sizeof(" << curr->get_output_element_type(0).c_type_string()
-                             << "));\n";
-
-                        fout << "  NNfusionMemcpy op_" << arg_names[curr] << "(device, "
-                             << arg_names[curr] << ", load_data<"
-                             << curr->get_output_element_type(0).c_type_string() << ">(\""
-                             << arg_names[curr] << "\", " << arg_names[curr]
-                             << ".NumElements()), true);\n";
-                    };
-
-                    kernel_dict["Parameter"] = [&](std::shared_ptr<GNode>& curr,
-                                                   std::ofstream& fout) {
-                        fout << "NNfusionTensor " << arg_names[curr] << "(device, {"
-                             << join_collections(
-                                    curr->get_output_shape(0),
-                                    [](int idx, ssize_t it) { return std::to_string(it); })
-                             << "}, sizeof(" << curr->get_output_element_type(0).c_type_string()
-                             << "));\n";
-
-                        fout << "  NNfusionMemcpy op_" << arg_names[curr] << "(device, "
-                             << arg_names[curr] << ", load_data<"
-                             << curr->get_output_element_type(0).c_type_string() << ">(\"\", "
-                             << arg_names[curr] << ".NumElements()));\n";
-                    };
-
-                    kernel_dict["Result"] = [&](std::shared_ptr<GNode>& curr, std::ofstream& fout) {
-                        fout << "NNfusionMemcpy " << arg_names[curr] << "(device, nullptr, "
-                             << arg_names[curr->get_in_edge(0)->get_src()] << ");\n";
-                    };
-
-                    while (gen_q.size() > 0 || pend_q.size() > 0)
-                    {
-                        // Move to new super step if satisifed
-                        if (!gen_q.size())
-                            new_super_step();
-
-                        auto curr = gen_q.front();
-                        gen_q.pop_front();
-                        visited.insert(curr);
-
-                        auto entry = kernel_dict.find(curr->get_op_ptr()->get_op_type());
-                        if (entry != kernel_dict.end())
-                            entry->second(curr, fout);
-                        else
-                        {
-                            auto ir = nnfusion::op::get_translation_v2(curr);
-                            if (ir.empty())
-                                ir = nnfusion::op::get_translation(curr);
-                            if (ir != "")
-                            {
-                                const char annotation[] = "## @annotation: ";
-                                int pos = ir.find(annotation);
-                                std::string options;
-                                if (pos >= 0)
-                                {
-                                    pos += sizeof(annotation) - 1;
-                                    options = ir.substr(pos);
-                                }
-                                print_standard_codegen(curr, fout, ir, options);
-                            }
-                            else
-                                UNHANDLED_CASE(curr);
-                        }
-                        fout << std::endl;
-
-                        // Check its children about whether all inputs are ready (Must be put after any possible new_super_step())
-                        for (auto& edge : curr->get_out_edges())
-                        {
-                            if (edge->is_control_edge())
-                                continue;
-                            NNFUSION_CHECK(edge->get_src() == curr);
-                            NNFUSION_CHECK(visited.count(edge->get_dst()) == 0);
-
-                            bool ready = true;
-                            for (auto& from : edge->get_dst()->get_in_edges())
-                            {
-                                if (from->is_control_edge())
-                                    continue;
-                                if (visited.count(from->get_src()) == 0)
-                                {
-                                    ready = false;
-                                    break;
-                                }
-                            }
-                            if (ready)
-                            {
-                                // Only join pend_q once
-                                if (vis_pend.count(edge->get_dst()) == 0)
-                                {
-                                    vis_pend.insert(edge->get_dst());
-                                    pend_q.push_back(edge->get_dst());
-                                }
-                            }
-                        }
-                    }
-
-                    fout << "#endif\n\n";
-                    fout << R"(
-  device.pCommandQueue->ExecuteCommandLists(preloadQueue.size(), preloadQueue.data());
-  device.pCommandQueue->ExecuteCommandLists(cmdQueue.size(), cmdQueue.data());
-  device.AwaitExecution();
-)";
-                    // Print Results
-                    for (auto& curr : graph->get_outputs()) // Print output nodes
-                    {
-                        if (blacklist.count(curr))
-                            continue;
-                        fout << arg_names[curr] << ".PrintStageBuffer<"
-                             << curr->get_output_element_type(0).c_type_string() << ">(device, \""
-                             << arg_names[curr] << "\");\n";
-                    }
-
-                    fout << std::endl;
-
-                    nnfusion::codegen::copy_file_from_templates(
-                        currentBackend + "/DxCompute.vcxproj",
-                        "nnfusion_rt/" + currentBackend + "_codegen/DxCompute.vcxproj");
-                    nnfusion::codegen::copy_file_from_templates(currentBackend + "/run_graph.cpp",
-                                                                "nnfusion_rt/" + currentBackend +
-                                                                    "_codegen/run_graph.cpp");
-                    nnfusion::codegen::copy_file_from_templates(currentBackend + "/d3dx12_helper.h",
-                                                                "nnfusion_rt/" + currentBackend +
-                                                                    "_codegen/d3dx12_helper.h");
-                    nnfusion::codegen::copy_file_from_templates(
-                        currentBackend + "/d3dx12_nnfusion.h",
-                        "nnfusion_rt/" + currentBackend + "_codegen/d3dx12_nnfusion.h");
-                    NNFUSION_LOG(INFO) << currentBackend << " codegen finished.";
-                    exit(0);
-                    return true;
-                }
-            };
-        } // namespace pass
-    }     // namespace graph
-} // namespace nnfusion
diff --git a/src/nnfusion/frontend/onnx_import/util/util.cpp b/src/nnfusion/frontend/onnx_import/util/util.cpp
index 28a56620f..6cd9f4316 100644
--- a/src/nnfusion/frontend/onnx_import/util/util.cpp
+++ b/src/nnfusion/frontend/onnx_import/util/util.cpp
@@ -91,35 +91,6 @@ namespace nnfusion
                 element::Type element_type = tensor.get_ng_type();
                 return std::make_shared<op::Constant>(
                     element_type, shape, tensor.buffer_get_data());
-                // switch (onnx_et)
-                // {
-                // case onnx::TensorProto_DataType::TensorProto_DataType_BOOL:
-                //     return make_constant_op<bool>(element::boolean, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT:
-                // case onnx::TensorProto_DataType::TensorProto_DataType_FLOAT16:
-                //     return make_constant_op<float>(element::f32, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_DOUBLE:
-                //     return make_constant_op<double>(element::f64, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_INT8:
-                //     return make_constant_op<int8_t>(element::i8, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_INT16:
-                //     return make_constant_op<int16_t>(element::i16, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_INT32:
-                //     return make_constant_op<int32_t>(element::i32, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_INT64:
-                //     return make_constant_op<int64_t>(element::i64, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT8:
-                //     return make_constant_op<uint8_t>(element::u8, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT16:
-                //     return make_constant_op<uint16_t>(element::u16, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT32:
-                //     return make_constant_op<uint32_t>(element::u32, shape, tensor);
-                // case onnx::TensorProto_DataType::TensorProto_DataType_UINT64:
-                //     return make_constant_op<uint64_t>(element::u64, shape, tensor);
-                // default:
-                //     NNFUSION_CHECK_FAIL() << "unsupported value info element type: "
-                //                           << onnx::TensorProto_DataType_Name(onnx_et);
-                // }
             }
 
             std::shared_ptr<graph::GNode> GetInputNode(const NodeMap& all_ng_nodes,

From ef5276726434c107f4a226cfd072c72099622755 Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 10 Mar 2021 15:11:58 +0800
Subject: [PATCH 29/32] codestyle

---
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 161 +++++++++---------
 .../core/kernels/kernel_registration.cpp      |   1 -
 2 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 712da73c5..604e35e7b 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -285,92 +285,93 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                     std::vector<std::string> arg_vec{"arg1", "output"};
                     std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
 
-        lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
-
-        lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-           << " CUBLAS_OP_N,"
-           << " CUBLAS_OP_N,"
-           << " " << n << ","
-           << " " << m << ","
-           << " " << k << ","
-           << " &alpha,"
-           << " static_cast<const half*>(input1),"
-           << " " << n << ","
-           << " static_cast<const half*>(input0),"
-           << " " << k << ","
-           << " &beta,"
-           << " static_cast<half*>(output0),"
-           << " " << n << "));\n";
-        // }
-        
-    } else {
-        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."
-    }
-    //lu.block_end();
-    return _lu;
-}
+                    lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
+
+                    lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+                       << " CUBLAS_OP_N,"
+                       << " CUBLAS_OP_N,"
+                       << " " << n << ","
+                       << " " << m << ","
+                       << " " << k << ","
+                       << " &alpha,"
+                       << " static_cast<const half*>(input1),"
+                       << " " << n << ","
+                       << " static_cast<const half*>(input0),"
+                       << " " << k << ","
+                       << " &beta,"
+                       << " static_cast<half*>(output0),"
+                       << " " << n << "));\n";
+                    // }
+                }
+                else
+                {
+                    NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."
+                }
+                //lu.block_end();
+                return _lu;
+            }
 
-LanguageUnit_p cuda::Dot::emit_dependency()
-{
-    LanguageUnit_p _lu(new LanguageUnit(get_function_name() + "_dep"));
-    _lu->require(header::cuda);
-    _lu->require(header::cublas);
-    _lu->require(header::stdexcept);
-    _lu->require(header::sstream);
-    _lu->require(macro::CUBLAS_SAFE_CALL);
-    _lu->require(macro::CUDA_SAFE_CALL);
-    // _lu->require(declaration::cuda_fp16_scale);
-    //_lu->require(declaration::cublas_handle);
-    return _lu;
-}
+            LanguageUnit_p cuda::Dot::emit_dependency()
+            {
+                LanguageUnit_p _lu(new LanguageUnit(get_function_name() + "_dep"));
+                _lu->require(header::cuda);
+                _lu->require(header::cublas);
+                _lu->require(header::stdexcept);
+                _lu->require(header::sstream);
+                _lu->require(macro::CUBLAS_SAFE_CALL);
+                _lu->require(macro::CUDA_SAFE_CALL);
+                // _lu->require(declaration::cuda_fp16_scale);
+                //_lu->require(declaration::cublas_handle);
+                return _lu;
+            }
 
-LanguageUnit_p cuda::Dot::emit_function_signature()
-{
-    LanguageUnit_p _lu(new LanguageUnit(this->m_kernel_name + "_sig"));
-    auto& lu = *_lu;
+            LanguageUnit_p cuda::Dot::emit_function_signature()
+            {
+                LanguageUnit_p _lu(new LanguageUnit(this->m_kernel_name + "_sig"));
+                auto& lu = *_lu;
 
-    vector<string> params;
-    for (size_t i = 0; i < m_context->inputs.size(); i++)
-    {
-        stringstream ss;
-        ss << m_context->inputs[i]->get_element_type().c_type_string() << "* ";
-        ss << "input" << i;
-        params.push_back(ss.str());
-    }
+                vector<string> params;
+                for (size_t i = 0; i < m_context->inputs.size(); i++)
+                {
+                    stringstream ss;
+                    ss << m_context->inputs[i]->get_element_type().c_type_string() << "* ";
+                    ss << "input" << i;
+                    params.push_back(ss.str());
+                }
 
-    for (size_t i = 0; i < m_context->outputs.size(); i++)
-    {
-        stringstream ss;
-        ss << m_context->outputs[i]->get_element_type().c_type_string() << "* ";
-        ss << "output" << i;
-        params.push_back(ss.str());
-    }
+                for (size_t i = 0; i < m_context->outputs.size(); i++)
+                {
+                    stringstream ss;
+                    ss << m_context->outputs[i]->get_element_type().c_type_string() << "* ";
+                    ss << "output" << i;
+                    params.push_back(ss.str());
+                }
 
-    for (size_t i = 0; i < m_context->tensors.size(); i++)
-    {
-        stringstream ss;
-        ss << m_context->tensors[i]->get_element_type().c_type_string() << "* ";
-        // defult name is: "persit0", "persist1" ...
-        ss << m_context->tensors[i]->get_name();
-        params.push_back(ss.str());
-    }
+                for (size_t i = 0; i < m_context->tensors.size(); i++)
+                {
+                    stringstream ss;
+                    ss << m_context->tensors[i]->get_element_type().c_type_string() << "* ";
+                    // defult name is: "persit0", "persist1" ...
+                    ss << m_context->tensors[i]->get_name();
+                    params.push_back(ss.str());
+                }
 
-    lu << "void "
-       << "(cublasHandle_t cublas_handle, " << join(params, ", ") << ")";
-    return _lu;
-}
+                lu << "void "
+                   << "(cublasHandle_t cublas_handle, " << join(params, ", ") << ")";
+                return _lu;
+            }
 
-REGISTER_KERNEL_EMITTER(
-    "Dot",                                                                   // op_name
-    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
-    cuda::Dot)                                                               // constructor
+            REGISTER_KERNEL_EMITTER(
+                "Dot",                                                                   // op_name
+                Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
+                cuda::Dot) // constructor
 
-REGISTER_KERNEL_EMITTER(
-    "Dot",                                                                   // op_name
-    Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs
-    cuda::Dot)                                                               // constructor
+            REGISTER_KERNEL_EMITTER(
+                "Dot",                                                                   // op_name
+                Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs
+                cuda::Dot) // constructor
 
-REGISTER_KERNEL_EMITTER(
-    "Dot",                                                                   // op_name
-    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
-    cuda::Dot)                                                               // constructor
+            REGISTER_KERNEL_EMITTER(
+                "Dot",                                                                   // op_name
+                Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
+                cuda::Dot) // constructor
diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp
index 786c41d87..79d2a17c2 100644
--- a/src/nnfusion/core/kernels/kernel_registration.cpp
+++ b/src/nnfusion/core/kernels/kernel_registration.cpp
@@ -4,7 +4,6 @@
 #include "kernel_registration.hpp"
 #include "nnfusion/common/type/element_type.hpp"
 #include "nnfusion/util/util.hpp"
-#include "ngraph/src/nnfusion/common/type/element_type.hpp"
 #include "nnfusion/util/util.hpp"
 
 using namespace nnfusion;

From 80904a24b47947b9e311999969c03103d6e3e7df Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 10 Mar 2021 15:24:59 +0800
Subject: [PATCH 30/32] fix dot

---
 .../core/kernels/cuda_gpu/kernels/dot.cpp     | 165 +++++++++---------
 1 file changed, 86 insertions(+), 79 deletions(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 604e35e7b..5896bbcae 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -285,93 +285,100 @@ LanguageUnit_p cuda::Dot::emit_function_body()
                     std::vector<std::string> arg_vec{"arg1", "output"};
                     std::vector<nnfusion::Shape> shape_vec{arg1_shape, out_shape};
 
-                    lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
-
-                    lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
-                       << " CUBLAS_OP_N,"
-                       << " CUBLAS_OP_N,"
-                       << " " << n << ","
-                       << " " << m << ","
-                       << " " << k << ","
-                       << " &alpha,"
-                       << " static_cast<const half*>(input1),"
-                       << " " << n << ","
-                       << " static_cast<const half*>(input0),"
-                       << " " << k << ","
-                       << " &beta,"
-                       << " static_cast<half*>(output0),"
-                       << " " << n << "));\n";
-                    // }
-                }
-                else
-                {
-                    NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."
+                    NNFUSION_CHECK_FAIL() << nnfusion::join(arg_vec) << " with "
+                                          << nnfusion::join(shape_vec) << " respectively, at Node "
+                                          << m_context->gnode->get_name()
+                                          << ", do not match for dot op";
                 }
-                //lu.block_end();
-                return _lu;
             }
 
-            LanguageUnit_p cuda::Dot::emit_dependency()
-            {
-                LanguageUnit_p _lu(new LanguageUnit(get_function_name() + "_dep"));
-                _lu->require(header::cuda);
-                _lu->require(header::cublas);
-                _lu->require(header::stdexcept);
-                _lu->require(header::sstream);
-                _lu->require(macro::CUBLAS_SAFE_CALL);
-                _lu->require(macro::CUDA_SAFE_CALL);
-                // _lu->require(declaration::cuda_fp16_scale);
-                //_lu->require(declaration::cublas_handle);
-                return _lu;
-            }
+            lu << "const half alpha = 1.0f;\nconst half beta = 0.f;\n";
 
-            LanguageUnit_p cuda::Dot::emit_function_signature()
-            {
-                LanguageUnit_p _lu(new LanguageUnit(this->m_kernel_name + "_sig"));
-                auto& lu = *_lu;
+            lu << "CUBLAS_SAFE_CALL(cublasHgemm(cublas_handle,"
+               << " CUBLAS_OP_N,"
+               << " CUBLAS_OP_N,"
+               << " " << n << ","
+               << " " << m << ","
+               << " " << k << ","
+               << " &alpha,"
+               << " static_cast<const half*>(input1),"
+               << " " << n << ","
+               << " static_cast<const half*>(input0),"
+               << " " << k << ","
+               << " &beta,"
+               << " static_cast<half*>(output0),"
+               << " " << n << "));\n";
+        }
+    }
+    else
+    {
+        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."
+    }
+    //lu.block_end();
+    return _lu;
+}
 
-                vector<string> params;
-                for (size_t i = 0; i < m_context->inputs.size(); i++)
-                {
-                    stringstream ss;
-                    ss << m_context->inputs[i]->get_element_type().c_type_string() << "* ";
-                    ss << "input" << i;
-                    params.push_back(ss.str());
-                }
+LanguageUnit_p cuda::Dot::emit_dependency()
+{
+    LanguageUnit_p _lu(new LanguageUnit(get_function_name() + "_dep"));
+    _lu->require(header::cuda);
+    _lu->require(header::cublas);
+    _lu->require(header::stdexcept);
+    _lu->require(header::sstream);
+    _lu->require(macro::CUBLAS_SAFE_CALL);
+    _lu->require(macro::CUDA_SAFE_CALL);
+    // _lu->require(declaration::cuda_fp16_scale);
+    //_lu->require(declaration::cublas_handle);
+    return _lu;
+}
 
-                for (size_t i = 0; i < m_context->outputs.size(); i++)
-                {
-                    stringstream ss;
-                    ss << m_context->outputs[i]->get_element_type().c_type_string() << "* ";
-                    ss << "output" << i;
-                    params.push_back(ss.str());
-                }
+LanguageUnit_p cuda::Dot::emit_function_signature()
+{
+    LanguageUnit_p _lu(new LanguageUnit(this->m_kernel_name + "_sig"));
+    auto& lu = *_lu;
 
-                for (size_t i = 0; i < m_context->tensors.size(); i++)
-                {
-                    stringstream ss;
-                    ss << m_context->tensors[i]->get_element_type().c_type_string() << "* ";
-                    // defult name is: "persit0", "persist1" ...
-                    ss << m_context->tensors[i]->get_name();
-                    params.push_back(ss.str());
-                }
+    vector<string> params;
+    for (size_t i = 0; i < m_context->inputs.size(); i++)
+    {
+        stringstream ss;
+        ss << m_context->inputs[i]->get_element_type().c_type_string() << "* ";
+        ss << "input" << i;
+        params.push_back(ss.str());
+    }
 
-                lu << "void "
-                   << "(cublasHandle_t cublas_handle, " << join(params, ", ") << ")";
-                return _lu;
-            }
+    for (size_t i = 0; i < m_context->outputs.size(); i++)
+    {
+        stringstream ss;
+        ss << m_context->outputs[i]->get_element_type().c_type_string() << "* ";
+        ss << "output" << i;
+        params.push_back(ss.str());
+    }
+
+    for (size_t i = 0; i < m_context->tensors.size(); i++)
+    {
+        stringstream ss;
+        ss << m_context->tensors[i]->get_element_type().c_type_string() << "* ";
+        // defult name is: "persit0", "persist1" ...
+        ss << m_context->tensors[i]->get_name();
+        params.push_back(ss.str());
+    }
+
+    lu << "void "
+       << "(cublasHandle_t cublas_handle, " << join(params, ", ") << ")";
+    return _lu;
+}
 
-            REGISTER_KERNEL_EMITTER(
-                "Dot",                                                                   // op_name
-                Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
-                cuda::Dot) // constructor
+REGISTER_KERNEL_EMITTER(
+    "Dot",                                                                   // op_name
+    Device(CUDA_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
+    cuda::Dot)                                                               // constructor
 
-            REGISTER_KERNEL_EMITTER(
-                "Dot",                                                                   // op_name
-                Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs
-                cuda::Dot) // constructor
+REGISTER_KERNEL_EMITTER(
+    "Dot",                                                                   // op_name
+    Device(CUDA_GPU).TypeConstraint(element::f16).Tag("cublas").Priority(2), // attrs
+    cuda::Dot)                                                               // constructor
 
-            REGISTER_KERNEL_EMITTER(
-                "Dot",                                                                   // op_name
-                Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
-                cuda::Dot) // constructor
+REGISTER_KERNEL_EMITTER(
+    "Dot",                                                                   // op_name
+    Device(ROCM_GPU).TypeConstraint(element::f32).Tag("cublas").Priority(2), // attrs
+    cuda::Dot)                                                               // constructor

From 65cc6f7fbeb4bb1a2b73ffbc303d35e479161a3e Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 10 Mar 2021 15:37:46 +0800
Subject: [PATCH 31/32] remove duplicate include

---
 src/nnfusion/core/kernels/kernel_registration.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/nnfusion/core/kernels/kernel_registration.cpp b/src/nnfusion/core/kernels/kernel_registration.cpp
index 79d2a17c2..67ded5b8f 100644
--- a/src/nnfusion/core/kernels/kernel_registration.cpp
+++ b/src/nnfusion/core/kernels/kernel_registration.cpp
@@ -4,7 +4,6 @@
 #include "kernel_registration.hpp"
 #include "nnfusion/common/type/element_type.hpp"
 #include "nnfusion/util/util.hpp"
-#include "nnfusion/util/util.hpp"
 
 using namespace nnfusion;
 using namespace nnfusion::kernels;

From 40716bb6effe4d7a9ef064570085d0f9ddfa11cd Mon Sep 17 00:00:00 2001
From: Niupple <niupple@gmail.com>
Date: Wed, 10 Mar 2021 16:47:03 +0800
Subject: [PATCH 32/32] fix a missing semicolon

---
 src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
index 5896bbcae..9d84ff321 100644
--- a/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
+++ b/src/nnfusion/core/kernels/cuda_gpu/kernels/dot.cpp
@@ -312,7 +312,7 @@ LanguageUnit_p cuda::Dot::emit_function_body()
     }
     else
     {
-        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot."
+        NNFUSION_CHECK_FAIL() << "Unsupported datatype " << dtype << " for nernel dot.";
     }
     //lu.block_end();
     return _lu;