From 73db701b4a4d4d3153829c7bf6ce5a45a954555c Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Mon, 23 Jan 2017 18:25:25 -0800
Subject: [PATCH 01/15] Merge changes from github. Change: 145363673

---
 tensorflow/core/kernels/conv_ops.cc | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index facfe4467d..ecf6c0acd9 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -229,6 +229,10 @@ class LaunchXsmmConvOp<CPUDevice, float> {
       return false;
     }
 
+    if (!CanUseXsmmConv2D(desc, data_format)) {
+      return false;
+    }
+
     auto input_ptr = input.template flat<float>().data();
     auto filter_ptr = filter.template flat<float>().data();
     auto output_ptr = output->template flat<float>().data();

From 9860fe12b4a4402277b7b0ae31e32ab0fb33a15c Mon Sep 17 00:00:00 2001
From: taknevski <taknevski@users.noreply.github.com>
Date: Mon, 6 Feb 2017 21:03:13 -0800
Subject: [PATCH 02/15] added backward handle caching and hash modification
 (#12)

---
 tensorflow/core/kernels/xsmm_conv2d.cc | 73 +++++++++++++++-----------
 1 file changed, 43 insertions(+), 30 deletions(-)

diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index 823cdf7e09..faac3f4f85 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -136,6 +136,18 @@ class libxsmm_dnn_conv_desc_wrap{
  
 struct HashFunction{
   std::size_t operator()(const libxsmm_dnn_conv_desc_wrap & w) const{
+   
+    
+
+
+    //unsigned char ptr[sizeof(&w.d)];
+
+    
+    //memcpy(ptr, (unsigned char *)&w.d, sizeof(&w.d))
+                                       
+
+    //
+    /*
     std::ostringstream N,C,H,W,K,R,S,u,v,padh,padw;
  
     N << w.d.N; C << w.d.C;
@@ -144,28 +156,28 @@ struct HashFunction{
     S << w.d.S; u << w.d.u;
     v << w.d.v; padh << w.d.pad_h_in;
     padw << w.d.pad_w_in;
-
-
+ 
+ 
     std::string out_ =   N.str() + C.str()\
                        + H.str() + W.str()\
                        + K.str() + R.str()\
                        + S.str() + u.str()\
                        + v.str() + padh.str()\
                        + padw.str();
-
-    return ( std::hash<std::string>()(out_));
+    //
+    //
+    */
+    return ( std::hash<unsigned long long>()((unsigned long long)&(w.d)));
   }
 };
-
+ 
 class handles{
   public:
     libxsmm_dnn_layer* find( const libxsmm_dnn_conv_desc_wrap &w) {
-      std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*,
-                         HashFunction>::iterator i = libxsmm_handles.find(w);
+      std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i = libxsmm_handles.find(w);
       if (i == libxsmm_handles.end()){
         libxsmm_dnn_err_t status;
-        libxsmm_dnn_layer* libxsmm_handle =
-            libxsmm_dnn_create_conv_layer(w.d, &status);
+        libxsmm_dnn_layer* libxsmm_handle = libxsmm_dnn_create_conv_layer(w.d, &status);
         chk_libxsmm_err(status, "Create handle");
         libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
         return libxsmm_handle;
@@ -174,14 +186,15 @@ class handles{
         return i->second;
     }
    ~handles(){
-     std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*,
-                        HashFunction>::iterator i;
+    std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i;
     for (i= libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
       chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(i->second),
                     "Destroy handle");
     }
   private:
+ 
     std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction> libxsmm_handles;
+ 
 };
 
 static handles libxsmm_handles;
@@ -198,12 +211,12 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   libxsmm_dnn_conv_desc_wrap w(desc);
   void* scratch;
  
-  if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
-    libxsmm_handle = libxsmm_handles.find(w);
-  else {
-    libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
-    chk_libxsmm_err(status, "Create handle");
-  }
+  //if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
+  libxsmm_handle = libxsmm_handles.find(w);
+  //else{
+  //  libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
+  //  chk_libxsmm_err(status, "Create handle");
+  //}
   
   status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind);
   if (status == LIBXSMM_DNN_WARN_FALLBACK) {
@@ -216,23 +229,23 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   libxsmm_dnn_buffer* libxsmm_input;
   libxsmm_dnn_buffer* libxsmm_output;
   libxsmm_dnn_filter* libxsmm_filter;
-
- /*
+  
+ /* 
   const DeviceBase::CpuWorkerThreads* worker_threads =
       ctx->device()->tensorflow_cpu_worker_threads();
-
+ 
   int num_threads = worker_threads->num_threads;
 */
 
   int ifmblock = (libxsmm_handle->ifmblock);
-  int ofmblock = (libxsmm_handle->ofmblock);
+  int ofmblock = (libxsmm_handle->ofmblock); 
 
-  int blocksifm = desc.C%ifmblock ==0 ? desc.C/ifmblock :desc.C/ifmblock + 1;
+  int blocksifm = desc.C%ifmblock ==0 ? desc.C/ifmblock :desc.C/ifmblock + 1;           
   int blocksofm = desc.K%ofmblock ==0 ? desc.K/ofmblock :desc.K/ofmblock + 1;
-  float *native_filter = (float*)libxsmm_aligned_scratch(
-      blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float),
-      2097152);
+  float *native_filter = (float*)libxsmm_aligned_scratch( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152);
+ 
 
+  
   const DeviceBase::CpuWorkerThreads* worker_threads =
       ctx->device()->tensorflow_cpu_worker_threads();
 
@@ -301,9 +314,9 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   }
 
   /* bind scratch */
-  scratch = (void*)libxsmm_aligned_scratch( libxsmm_dnn_get_scratch_size( libxsmm_handle, kind, &status ), 2097152);
+  scratch = (void*)libxsmm_aligned_scratch( libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ), 2097152);
   chk_libxsmm_err( status, "scratch allocation" );
-  chk_libxsmm_err( libxsmm_dnn_bind_scratch( libxsmm_handle, kind, scratch ), "binding scratch" );
+  chk_libxsmm_err( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ), "binding scratch" );
 
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
     libxsmm_dnn_transpose_filter(libxsmm_handle, LIBXSMM_DNN_FILTER);
@@ -335,9 +348,9 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");
   chk_libxsmm_err(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter");
   
-  if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
-    chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
-                  "Destroy handle");
+  //if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
+  //chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
+  //               "Destroy handle");
 
   libxsmm_free(native_filter);
   libxsmm_free(scratch);

From f799898a017369f3d1e85ad8972b155eaac9f337 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hfp@users.noreply.github.com>
Date: Mon, 20 Feb 2017 21:11:19 +0100
Subject: [PATCH 03/15] Preparation for LIBXSMM 1.8 (#13)

* Fixed libxsmm_config_arguments: Fixed the incorrect value supposed to trigger auto-prefetch. Fixed the 0-threshold, which is now accounted for in LIBXSMM (by just populating the default threshold). The problem arised from the assumption "threshold: fallback to BLAS if n*m*k above this", which is wrong (the threshold populates an upper bound until which JIT code is generated). The previous configuration perhaps caused all sorts of issues due to other values derived from the 0-threshold. Note, explicitly JIT'ting code is/was never subject to a threshold.

* Upgraded to libxsmm 1.6.5

* Enable the use of libxsmm for matrix multiplications

* Enable the use of libxsmm to speedup 1x1 convolutions (which are
computed using matrix multiplications)

* Fixed libxsmm_config_arguments in libxsmm.BUILD (#7)

* Fixed libxsmm_config_arguments: Fixed the incorrect value supposed to trigger auto-prefetch. Fixed the 0-threshold, which is now accounted for in LIBXSMM (by just populating the default threshold). The problem arised from the assumption "threshold: fallback to BLAS if n*m*k above this", which is wrong (the threshold populates an upper bound until which JIT code is generated). The previous configuration perhaps caused all sorts of issues due to other values derived from the 0-threshold. Note, explicitly JIT'ting code is/was never subject to a threshold.

* Make use of TensorFlow's allocation infrastructure even when using LIBXSMM allocation functions. In particular, the (cached) libxsmm_spmdm_init now relies on TF's cpu_allocator().

For C++ code, one can use a libxsmm_scoped_allocator<kind> in order to (temporarily) setup a different allocation mechanism. For instance, using libxsmm_tf_allocator<libxsmm_scratch_allocator> changes LIBXSMM's scratch allocator to rely on TensorFlow. The libxsmm_tf_allocator provides two kinds of c'tors: (1) the no-argument variant adopts TF's cpu_allocator(), whereas the one-argument form (2) adopts the allocator from the given OpKernelContext. Changing the allocator in LIBXSMM with pending buffers (from different allocators) is valid, and all other services in LIBXSMM's "malloc domain" work regardless of the allocation mechanism (e.g., libxsmm_malloc_size).

* Simply renamed API items in order to follow changes in LIBXSMM 1.7. This is incomplete as more changes/adjustments are needed.

* Account for removed non-check API.

* Include libxsmm_malloc.h now that libxsmm_tf_allocator is used.

* Renamed libxsmm_dnn_create_conv_handle to libxsmm_dnn_create_conv_layer.

* Renamed LIBXSMM_DNN_CONV_FORMAT_* to LIBXSMM_DNN_TENSOR_FORMAT_*.

* Renamed libxsmm_dnn_destroy_conv_handle to libxsmm_dnn_destroy_conv_layer.

* Include missing header file (libxsmm_malloc.h).

* Renamed LIBXSMM_DNN_CONV_KIND_* to LIBXSMM_DNN_COMPUTE_KIND_*.

* Account for the fact that datatype_in/out is now only datatype (libxsmm_dnn_conv_desc structure).

* Updated to new libxsmm_dnn_link_* functions.

* Updated to use new libxsmm_dnn_bind_* functions.

* Fixed calling libxsmm_dnn_transpose_filter.

* Updates in preparation of LIBXSMM 1.7 (#8)

* Fixed libxsmm_config_arguments: Fixed the incorrect value supposed to trigger auto-prefetch. Fixed the 0-threshold, which is now accounted for in LIBXSMM (by just populating the default threshold). The problem arised from the assumption "threshold: fallback to BLAS if n*m*k above this", which is wrong (the threshold populates an upper bound until which JIT code is generated). The previous configuration perhaps caused all sorts of issues due to other values derived from the 0-threshold. Note, explicitly JIT'ting code is/was never subject to a threshold.

* Upgraded to libxsmm 1.6.5

* Enable the use of libxsmm for matrix multiplications

* Enable the use of libxsmm to speedup 1x1 convolutions (which are
computed using matrix multiplications)

* Make use of TensorFlow's allocation infrastructure even when using LIBXSMM allocation functions. In particular, the (cached) libxsmm_spmdm_init now relies on TF's cpu_allocator().

For C++ code, one can use a libxsmm_scoped_allocator<kind> in order to (temporarily) setup a different allocation mechanism. For instance, using libxsmm_tf_allocator<libxsmm_scratch_allocator> changes LIBXSMM's scratch allocator to rely on TensorFlow. The libxsmm_tf_allocator provides two kinds of c'tors: (1) the no-argument variant adopts TF's cpu_allocator(), whereas the one-argument form (2) adopts the allocator from the given OpKernelContext. Changing the allocator in LIBXSMM with pending buffers (from different allocators) is valid, and all other services in LIBXSMM's "malloc domain" work regardless of the allocation mechanism (e.g., libxsmm_malloc_size).

* Simply renamed API items in order to follow changes in LIBXSMM 1.7. This is incomplete as more changes/adjustments are needed.

* Account for removed non-check API.

* Include libxsmm_malloc.h now that libxsmm_tf_allocator is used.

* Renamed libxsmm_dnn_create_conv_handle to libxsmm_dnn_create_conv_layer.

* Renamed LIBXSMM_DNN_CONV_FORMAT_* to LIBXSMM_DNN_TENSOR_FORMAT_*.

* Renamed libxsmm_dnn_destroy_conv_handle to libxsmm_dnn_destroy_conv_layer.

* Include missing header file (libxsmm_malloc.h).

* Renamed LIBXSMM_DNN_CONV_KIND_* to LIBXSMM_DNN_COMPUTE_KIND_*.

* Account for the fact that datatype_in/out is now only datatype (libxsmm_dnn_conv_desc structure).

* Updated to new libxsmm_dnn_link_* functions.

* Updated to use new libxsmm_dnn_bind_* functions.

* Fixed calling libxsmm_dnn_transpose_filter.

* integrated LIBXSMM 1.7

* support for LIBXSMM 1.7 (#9)

* Upgraded to libxsmm 1.6.5

* Enable the use of libxsmm for matrix multiplications

* Enable the use of libxsmm to speedup 1x1 convolutions (which are
computed using matrix multiplications)

* integrated LIBXSMM 1.7

* updated LIBXSMM to 1.7.1

* updated to LIBXSMM 1.7.1 (#10)

* Upgraded to libxsmm 1.6.5

* Enable the use of libxsmm for matrix multiplications

* Enable the use of libxsmm to speedup 1x1 convolutions (which are
computed using matrix multiplications)

* integrated LIBXSMM 1.7

* updated LIBXSMM to 1.7.1

* merge alheinecke master (#11)

* Upgraded to libxsmm 1.6.5

* Enable the use of libxsmm for matrix multiplications

* Enable the use of libxsmm to speedup 1x1 convolutions (which are
computed using matrix multiplications)

* integrated LIBXSMM 1.7

* updated LIBXSMM to 1.7.1

* Take new translation units into account (LIBXSMM 1.8).

* Account for adjusted header dependency in LIBXSMM (TODO: API to avoid incl. header from LIBXSMM's src).

* Trigger rebuild if template changed (LIBXSMM).
---
 tensorflow/core/kernels/xsmm_conv2d.cc |  4 ++--
 third_party/libxsmm.BUILD              | 32 +++++++++++++++-----------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index faac3f4f85..725358658d 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -32,8 +32,8 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void);
 #include "tensorflow/core/lib/core/threadpool.h"
 
 #include "include/libxsmm_cpuid.h"
-#include "libxsmm_dnn_handle.h"
-#include "libxsmm_malloc.h"
+#include "include/libxsmm_malloc.h"
+#include "libxsmm_main.h" // TODO: API to avoid incl. header from src/
 
 namespace tensorflow {
 
diff --git a/third_party/libxsmm.BUILD b/third_party/libxsmm.BUILD
index 037009c072..32f48d9fd2 100644
--- a/third_party/libxsmm.BUILD
+++ b/third_party/libxsmm.BUILD
@@ -56,22 +56,26 @@ genrule(
 cc_library(
     name = "xsmm_avx",
     srcs = [
-        "src/libxsmm_main.c",
+        "src/libxsmm_cpuid_x86.c",
+        "src/libxsmm_dnn.c",
+        "src/libxsmm_dnn_convolution_backward.c",
+        "src/libxsmm_dnn_convolution_forward.c",
+        "src/libxsmm_dnn_convolution_weight_update.c",
+        "src/libxsmm_dnn_convolution_winograd_backward.c",
+        "src/libxsmm_dnn_convolution_winograd_forward.c",
+        "src/libxsmm_dnn_convolution_winograd_weight_update.c",
+        "src/libxsmm_dnn_handle.c",
         "src/libxsmm_dump.c",
-        "src/libxsmm_malloc.c",
+        "src/libxsmm_fsspmdm.c",
         "src/libxsmm_gemm.c",
+        "src/libxsmm_main.c",
+        "src/libxsmm_malloc.c",
+        "src/libxsmm_perf.c",
+        "src/libxsmm_spmdm.c",
+        "src/libxsmm_sync.c",
         "src/libxsmm_timer.c",
         "src/libxsmm_trace.c",
         "src/libxsmm_trans.c",
-        "src/libxsmm_sync.c",
-        "src/libxsmm_perf.c",
-        "src/libxsmm_spmdm.c",
-        "src/libxsmm_dnn.c",
-        "src/libxsmm_dnn_handle.c",
-        "src/libxsmm_dnn_convolution_forward.c",
-        "src/libxsmm_dnn_convolution_backward.c",
-        "src/libxsmm_dnn_convolution_weight_update.c",
-        "src/libxsmm_cpuid_x86.c",
     ] + glob([
         "src/generator_*.c",
     ]),
@@ -79,6 +83,7 @@ cc_library(
         "include/libxsmm_cpuid.h",
         "include/libxsmm_dnn.h",
         "include/libxsmm_frontend.h",
+        "include/libxsmm_fsspmdm.h",
         "include/libxsmm_generator.h",
         "include/libxsmm_intrinsics_x86.h",
         "include/libxsmm_macros.h",
@@ -91,14 +96,15 @@ cc_library(
         "include/libxsmm.h",
         "include/libxsmm_config.h",
         "include/libxsmm_dispatch.h",
-    ],
+    ] + glob([ # trigger rebuild if template changed
+        "src/template/*.c",
+    ]),
     copts = [
         "-mavx",  # JIT does not work without avx anyway, and this silences some CRC32 warnings.
         "-Wno-vla",  # Libxsmm convolutions heavily use VLA.
     ],
     defines = [
         "LIBXSMM_BUILD",
-        "LIBXSMM_CPUID_X86_NOINLINE",
         "__BLAS=0",
     ],
     includes = [

From 318520e2a7584ea4b7682c037fb2c2fe4ba45c90 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Wed, 22 Feb 2017 11:43:02 -0800
Subject: [PATCH 04/15] Upgraded to the latest version of libxsmm to fix build
 errors

---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index dac04440d0..652ee13760 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -89,11 +89,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "libxsmm_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/1.7.1.tar.gz",
-          "https://github.com/hfp/libxsmm/archive/1.7.1.tar.gz",
+          "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/940a1dc6173fef49881a69b9249e4bbaf3380d09.tar.gz",
+          "https://github.com/hfp/libxsmm/archive/940a1dc6173fef49881a69b9249e4bbaf3380d09.tar.gz",
       ],
-      sha256 = "9d3f63ce3eed62f04e4036de6f2be2ce0ff07781ca571af6e0bf85b077edf17a",
-      strip_prefix = "libxsmm-1.7.1",
+      sha256 = "dced27a393abf169594ff619dde6879c3220c39fdd83e4cd228553d4f8cd2a92",
+      strip_prefix = "libxsmm-940a1dc6173fef49881a69b9249e4bbaf3380d09",
       build_file = str(Label("//third_party:libxsmm.BUILD")),
   )
 

From e059269ef4a278a376b267b84b3b7933a82bbb86 Mon Sep 17 00:00:00 2001
From: Hans Pabst <hfp@users.noreply.github.com>
Date: Wed, 8 Mar 2017 16:23:49 +0100
Subject: [PATCH 05/15] Fixed calling libxsmm_spmdm_* by supplying 'T' for
 "transpose" instead of 'Y' (a.k.a. "Yes"). This fixes the issue unveiled by
 some of the regression tests (Thank you for reporting!). (#15)

---
 tensorflow/core/kernels/sparse_matmul_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 2ed0522ce4..95faff0b95 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -1642,6 +1642,7 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
                             actual_num_threads);
                       }
                     });
+
   // Do matrix-matrix multiplication
   std::atomic<int> cur_mult_block_number;
   cur_mult_block_number.store(0);

From d9c25cd34819f84e5d56362cab73ee762e502e46 Mon Sep 17 00:00:00 2001
From: Alexander Heinecke <alexander.heinecke@intel.com>
Date: Wed, 8 Mar 2017 20:48:27 -0800
Subject: [PATCH 06/15] improved LIBXSMM integration (#16)

---
 .../core/kernels/conv_grad_filter_ops.cc      |  3 ++
 .../core/kernels/conv_grad_input_ops.cc       | 23 +++++++----
 tensorflow/core/kernels/conv_ops.cc           |  6 +--
 tensorflow/core/kernels/xsmm_conv2d.cc        | 41 ++++++++++++++-----
 4 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 2e385f2c55..7d67fb3a7d 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -30,6 +30,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/conv_2d.h"
+#ifdef TENSORFLOW_USE_LIBXSMM
+#include "tensorflow/core/kernels/xsmm_conv2d.h"
+#endif
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 8bc79bebd9..8cd7f5db7e 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -131,7 +131,7 @@ struct LaunchXsmmBackwardInputConvolution {
                   typename TTypes<T, 4>::ConstTensor kernel,
                   typename TTypes<T, 4>::ConstTensor output_backward,
                   int input_rows, int input_cols, int row_stride,
-                  int col_stride, TensorFormat data_format) const {
+                  int col_stride, int pad_h, int pad_w, TensorFormat data_format) const {
     return false;
   }
 };
@@ -143,7 +143,7 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
                   typename TTypes<float, 4>::ConstTensor kernel,
                   typename TTypes<float, 4>::ConstTensor output_backward,
                   int input_rows, int input_cols, int row_stride,
-                  int col_stride, TensorFormat data_format) const {
+                  int col_stride, int pad_h, int pad_w, TensorFormat data_format) const {
     auto batch = input_backward.dimension(0);
     auto in_depth = input_backward.dimension(3);
     auto out_depth = output_backward.dimension(3);
@@ -162,10 +162,10 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
     desc.S = filter_cols;
     desc.u = row_stride;
     desc.v = col_stride;
-    desc.pad_h = 0;
-    desc.pad_w = 0;
-    desc.pad_h_in = 0;  // pad_rows;  // ignored by libxsmm for now.
-    desc.pad_w_in = 0;  // pad_cols;  // ignored by libxsmm for now.
+    desc.pad_h = pad_h;
+    desc.pad_w = pad_w;
+    desc.pad_h_in = 0;
+    desc.pad_w_in = 0;
     desc.pad_h_out = 0;
     desc.pad_w_out = 0;
     desc.threads = num_threads;
@@ -174,7 +174,7 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
     desc.filter_format =
         LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;  // LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
-    desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+    desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
 
     auto input_ptr = input_backward.data();
@@ -236,6 +236,15 @@ class Conv2DFastBackpropInputOp : public OpKernel {
                    context->allocate_output(0, input_shape, &in_backprop));
 
 #if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+/*
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+*/
     if (LaunchXsmmBackwardInputConvolution<Device, T>()(
             context, context->eigen_device<Device>(),
             in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index ecf6c0acd9..8076daf387 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -213,8 +213,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
     desc.v = stride_cols;
     desc.pad_h = pad_rows;
     desc.pad_w = pad_cols;
-    desc.pad_h_in = pad_rows;  // libxsmm supports only physical padding for now
-    desc.pad_w_in = pad_cols;  // libxsmm supports only physical padding for now
+    desc.pad_h_in = 0;
+    desc.pad_w_in = 0;
     desc.pad_h_out = 0;
     desc.pad_w_out = 0;
     desc.threads = num_threads;
@@ -222,7 +222,7 @@ class LaunchXsmmConvOp<CPUDevice, float> {
     desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
     desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
-    desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+    desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
 
     if (!CanUseXsmmConv2D(desc, data_format)) {
diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index 725358658d..05a5fbd8d7 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -59,10 +59,6 @@ bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc,
     VLOG(1) << "Cannot use XSMM convolutions: unsupported format!";
     return false;
   }
-  if (desc.pad_h_in != 0 || desc.pad_w_in != 0) {
-    VLOG(1) << "Cannot use XSMM convolutions: unsupported padding!";
-    return false;
-  }
   if (desc.K % VECTOR_SIZE != 0) {
     VLOG(1) << "Cannot use XSMM convolutions: output features count not"
                " divisible by vector size!";
@@ -127,8 +123,8 @@ class libxsmm_dnn_conv_desc_wrap{
               d.S == w.d.S &&
               d.u == w.d.u &&
               d.v == w.d.v &&
-              d.pad_h_in == w.d.pad_h_in &&
-              d.pad_w_in == w.d.pad_w_in
+              d.pad_h == w.d.pad_h &&
+              d.pad_w == w.d.pad_w
             );
     }
 };
@@ -292,10 +288,9 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
       libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
   chk_libxsmm_err(status, "Link filter");
 
-  chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
-
-
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
+    chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
+
     chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT),
                     "Bind input forward");
     chk_libxsmm_err(
@@ -303,7 +298,9 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
         "Bind output forward");
     chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER),
                     "Bind filter forward");
-  } else {
+  } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+    chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_input), "Zero input");
+
     chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_GRADIENT_INPUT),
                     "Bind input backward");
     chk_libxsmm_err(
@@ -311,6 +308,18 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
         "Bind output backward");
     chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER),
                     "Bind filter backward");
+  } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+    chk_libxsmm_err(libxsmm_dnn_zero_filter(libxsmm_filter), "Zero filter");
+
+    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT),
+                    "Bind input weight udpate");
+    chk_libxsmm_err(
+        libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_GRADIENT_OUTPUT),
+        "Bind output weight update");
+    chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_GRADIENT_FILTER),
+                    "Bind filter weight update");
+  } else {
+    /* shouldn't happen */
   }
 
   /* bind scratch */
@@ -333,16 +342,26 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   }
   counter.Wait();
 
+  if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+    libxsmm_dnn_reduce_wu_filters( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER );
+  }
+
   /* clean up */
   chk_libxsmm_err( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ), "release scratch" );
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
     chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ), "release input" );
     chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ), "release output" );
     chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ), "release filter" );
-  } else {
+  } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
     chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ), "release input" );
     chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ), "release output" );
     chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ), "release filter" );
+  } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+    chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ), "release input" );
+    chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ), "release output" );
+    chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ), "release filter" );
+  } else {
+    /* shouldn't happen */
   }
   chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_input), "Destroy input");
   chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");

From 57783b75e0a2041c45a0ddb004dcb7f699e3d981 Mon Sep 17 00:00:00 2001
From: Alexander Heinecke <alexander.heinecke@intel.com>
Date: Thu, 9 Mar 2017 16:42:17 -0800
Subject: [PATCH 07/15] update for detailed timing of LIBXSMM (#18)

* improved LIBXSMM integration

* added some timers to xsmm_conv2d

* added detailed timing report for LIBXSMM convolutions
---
 tensorflow/core/kernels/xsmm_conv2d.cc | 82 +++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index 05a5fbd8d7..b4d9a83375 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -26,6 +26,10 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void);
 #include "tensorflow/core/kernels/xsmm_conv2d.h"
 
 #include <stdlib.h>
+#include <cstring>
+#if 0
+#include <omp.h>
+#endif
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
@@ -195,11 +199,17 @@ class handles{
 
 static handles libxsmm_handles;
 
+//#define LIBXSMM_DETAILED_TIMING
+
 template <typename InputPtr, typename FilterPtr, typename OutputPtr>
 static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
                                    const libxsmm_dnn_conv_desc& desc,
                                    libxsmm_dnn_compute_kind kind, InputPtr input,
                                    FilterPtr filter, OutputPtr output) {
+#if defined(LIBXSMM_DETAILED_TIMING)
+  unsigned long long l_tick1, l_tick2, l_tick3, l_tick4, l_tick5, l_tick6, l_tick7, l_tick8, l_tick9, l_tick10;
+  l_tick1 = libxsmm_timer_tick();
+#endif
   // setup scoped allocator, which adopts the allocator from the context
   const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator(*ctx);
   libxsmm_dnn_err_t status;
@@ -225,13 +235,10 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   libxsmm_dnn_buffer* libxsmm_input;
   libxsmm_dnn_buffer* libxsmm_output;
   libxsmm_dnn_filter* libxsmm_filter;
-  
- /* 
-  const DeviceBase::CpuWorkerThreads* worker_threads =
-      ctx->device()->tensorflow_cpu_worker_threads();
- 
-  int num_threads = worker_threads->num_threads;
-*/
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick2 = libxsmm_timer_tick();
+#endif
 
   int ifmblock = (libxsmm_handle->ifmblock);
   int ofmblock = (libxsmm_handle->ofmblock); 
@@ -241,13 +248,12 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   float *native_filter = (float*)libxsmm_aligned_scratch( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152);
  
 
-  
   const DeviceBase::CpuWorkerThreads* worker_threads =
       ctx->device()->tensorflow_cpu_worker_threads();
 
   int num_threads = worker_threads->num_threads;
 
-
+#if 1  
   if(blocksofm > num_threads){
     int work = blocksofm;
     BlockingCounter count(num_threads);
@@ -277,6 +283,13 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
     }
     count.Wait();
   }
+#else
+  memset( native_filter, 0, blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float));
+#endif
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick3 = libxsmm_timer_tick();
+#endif
 
   libxsmm_input = libxsmm_dnn_link_buffer(
       libxsmm_handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
@@ -322,15 +335,28 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
     /* shouldn't happen */
   }
 
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick4 = libxsmm_timer_tick();
+#endif
+
   /* bind scratch */
   scratch = (void*)libxsmm_aligned_scratch( libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ), 2097152);
   chk_libxsmm_err( status, "scratch allocation" );
   chk_libxsmm_err( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ), "binding scratch" );
 
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick5 = libxsmm_timer_tick();
+#endif
+
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
     libxsmm_dnn_transpose_filter(libxsmm_handle, LIBXSMM_DNN_FILTER);
   }
 
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick6 = libxsmm_timer_tick();
+#endif
+
+#if 1
   BlockingCounter counter(num_threads);
   
   for (int i = 0; i < num_threads; ++i) {
@@ -341,6 +367,24 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
     });
   }
   counter.Wait();
+#else
+  #pragma omp parallel
+  {
+    chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, omp_get_thread_num()), "Worker");
+  }
+#endif
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick7 = libxsmm_timer_tick();
+#endif
+
+  if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+    libxsmm_dnn_reduce_wu_filters( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER );
+  }
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick8 = libxsmm_timer_tick();
+#endif
 
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
     libxsmm_dnn_reduce_wu_filters( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER );
@@ -366,6 +410,10 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_input), "Destroy input");
   chk_libxsmm_err(libxsmm_dnn_destroy_buffer(libxsmm_output), "Destroy output");
   chk_libxsmm_err(libxsmm_dnn_destroy_filter(libxsmm_filter), "Destroy filter");
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick9 = libxsmm_timer_tick();
+#endif
   
   //if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
   //chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
@@ -373,6 +421,22 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 
   libxsmm_free(native_filter);
   libxsmm_free(scratch);
+
+#if defined(LIBXSMM_DETAILED_TIMING)
+  l_tick10 = libxsmm_timer_tick();
+  printf("time for convolution (%i, %i, %i, %i, %i): %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n", desc.N, desc.C, desc.K, desc.R, desc.S, 
+                                                                                      libxsmm_timer_duration(l_tick1, l_tick2),
+                                                                                      libxsmm_timer_duration(l_tick2, l_tick3),
+                                                                                      libxsmm_timer_duration(l_tick3, l_tick4),
+                                                                                      libxsmm_timer_duration(l_tick4, l_tick5),
+                                                                                      libxsmm_timer_duration(l_tick5, l_tick6),
+                                                                                      libxsmm_timer_duration(l_tick6, l_tick7),
+                                                                                      libxsmm_timer_duration(l_tick7, l_tick8),
+                                                                                      libxsmm_timer_duration(l_tick8, l_tick9),
+                                                                                      libxsmm_timer_duration(l_tick9, l_tick10),
+                                                                                      libxsmm_timer_duration(l_tick1, l_tick10)  );
+#endif
+
   return true;  // Succeeded
 }
 

From b73658f9772a92f347e6fcb9521332305c61b922 Mon Sep 17 00:00:00 2001
From: Alexander Heinecke <alexander.heinecke@intel.com>
Date: Thu, 9 Mar 2017 16:45:31 -0800
Subject: [PATCH 08/15] update for previous merge (#19)

* improved LIBXSMM integration

* added some timers to xsmm_conv2d

* added detailed timing report for LIBXSMM convolutions

* update
---
 tensorflow/core/kernels/xsmm_conv2d.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index b4d9a83375..034a49cbb5 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -386,10 +386,6 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   l_tick8 = libxsmm_timer_tick();
 #endif
 
-  if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
-    libxsmm_dnn_reduce_wu_filters( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER );
-  }
-
   /* clean up */
   chk_libxsmm_err( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ), "release scratch" );
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {

From 3c2742a3f920d83968569dd557b66466fd36484b Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 14 Mar 2017 14:25:11 -0700
Subject: [PATCH 09/15] FIxed merge issues

---
 tensorflow/core/kernels/sparse_matmul_op.cc | 280 +++++---------------
 tensorflow/core/kernels/sparse_matmul_op.h  |  16 +-
 2 files changed, 71 insertions(+), 225 deletions(-)

diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 95faff0b95..46e743b4cf 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -837,15 +837,6 @@ class SparseMatMul {
 };
 
 #ifdef TENSORFLOW_USE_LIBXSMM
-#ifdef EXTRA_CACHE_LOGGING
-static tensorflow::mutex global_cache_stats_lock;
-static int total_num_entries_outstanding GUARDED_BY(global_cache_stats_lock) =
-    0;
-static int total_num_entries_in_cache GUARDED_BY(global_cache_stats_lock) = 0;
-#endif  // EXTRA_CACHE_LOGGING
-
-static const int max_entries_per_graph_node = 40;
-
 template <typename TL, typename TR>
 class LibxsmmSparseMatMul {
   typedef Eigen::Tensor<TL, 2, Eigen::RowMajor> MatrixL;
@@ -861,7 +852,6 @@ class LibxsmmSparseMatMul {
       MatrixMapR;
 
  public:
-#if 1
   // This structure contains a set of libxsmm kernels for sizes that have been
   // encountered previously by this operator so that libxsmm does not need to
   // reallocate its scratchpad memory each time (which hurts performance
@@ -880,181 +870,57 @@ class LibxsmmSparseMatMul {
       // useful (it is an empty struct right now)
       typename SparseMatMul<TL, TR>::TensorInfoCache
           non_libxsmm_cache;  // Currently not used
-      TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCacheEntry);
-      ~TensorInfoCacheEntry() {
-#ifdef EXTRA_CACHE_LOGGING
-        LOG(INFO) << "Deleting tensor cache entry at " << (void*)this;
-#endif  // EXTRA_CACHE_LOGGING
-        libxsmm_spmdm_destroy(&handle);
-      }
     };
-    // protects entries; invariant: entries is a valid std::list.
+    // protects entries; invariant: entries is a valid std::multimap
     tensorflow::mutex lock;
     // Because there could be multiple matrix multiplies with the same sizes
     // going on at the same time, we need to allow multiple cache entries for a
     // given set of parameters. Taking and returning entries is used to make
     // sure the same cache entry is not used from two threads at a time.
-    using entries_map_type = std::list<std::pair<
-        std::tuple<int, int, int, int>,
-        std::unique_ptr<TensorInfoCacheEntry>>>;  // multimap in LRU order
-    entries_map_type entries GUARDED_BY(
-        lock);  // MRU element at end so reverse search will find it first
-    int num_entries_outstanding GUARDED_BY(lock);
-
-    TensorInfoCache() : lock(), entries(), num_entries_outstanding(0) {}
+    std::multimap<std::tuple<int, int, int, int>,
+                  std::unique_ptr<TensorInfoCacheEntry>>
+        entries GUARDED_BY(lock);
+
+    TensorInfoCache() : lock(), entries() {}
     // Look up and remove first entry with these parameters, creating one if
     // there isn't one
     std::unique_ptr<TensorInfoCacheEntry> take_cache_entry(int M, int K, int N,
                                                            int max_threads)
-#ifdef EXTRA_CACHE_LOGGING
-        LOCKS_EXCLUDED(lock, global_cache_stats_lock)
-#else
-        LOCKS_EXCLUDED(lock)
-#endif
-    {
+        LOCKS_EXCLUDED(lock) {
       tensorflow::mutex_lock ml(lock);
-#ifdef EXTRA_CACHE_LOGGING
-      tensorflow::mutex_lock ml2(global_cache_stats_lock);
-#endif
       auto key = std::make_tuple(M, K, N, max_threads);
-      auto it_rev =
-          std::find_if(entries.rbegin(), entries.rend(),
-                       [&](const typename entries_map_type::value_type& e) {
-                         return e.first == key;
-                       });
-      auto it =
-          (it_rev == entries.rend() ? entries.end() : std::next(it_rev).base());
+      auto it = entries.find(key);
       if (it != entries.end()) {
         auto val = std::move(it->second);
         entries.erase(it);
-        ++num_entries_outstanding;
-#ifdef EXTRA_CACHE_LOGGING
-        ++total_num_entries_outstanding;
-        --total_num_entries_in_cache;
-        LOG(INFO) << "Used existing cache entry at " << (void*)val.get()
-                  << " for " << M << "x" << K << "x" << N << " max_threads "
-                  << max_threads
-                  << ", num_entries_outstanding = " << num_entries_outstanding
-                  << ", new cache size = " << entries.size()
-                  << ", total num_entries_outstanding = "
-                  << total_num_entries_outstanding
-                  << ", total cache size = " << total_num_entries_in_cache;
-#endif
         return val;
       } else {
-        while (!entries.empty() &&
-               entries.size() + num_entries_outstanding + 1 >
-                   max_entries_per_graph_node) {
-#ifdef EXTRA_CACHE_LOGGING
-          LOG(INFO) << "Removing old cache entry at "
-                    << (void*)entries.front().second.get();
-#endif
-          entries.pop_front();
-        }
         std::unique_ptr<TensorInfoCacheEntry> e{
             new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}};
         // setup scoped allocator, which uses cpu_allocator() for this scope
         const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
         libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr);
-        ++num_entries_outstanding;
-#ifdef EXTRA_CACHE_LOGGING
-        ++total_num_entries_outstanding;
-        LOG(INFO) << "Created cache entry at " << (void*)e.get() << " for " << M
-                  << "x" << K << "x" << N << " max_threads " << max_threads
-                  << ", num_entries_outstanding = " << num_entries_outstanding
-                  << ", new cache size = " << entries.size()
-                  << ", total num_entries_outstanding = "
-                  << total_num_entries_outstanding
-                  << ", total cache size = " << total_num_entries_in_cache;
-#endif
         return e;
       }
     }
     // Add a cache entry with certain parameters
     void return_cache_entry(std::unique_ptr<TensorInfoCacheEntry> e)
-#ifdef EXTRA_CACHE_LOGGING
-        LOCKS_EXCLUDED(lock, global_cache_stats_lock)
-#else
-        LOCKS_EXCLUDED(lock)
-#endif
-    {
+        LOCKS_EXCLUDED(lock) {
       tensorflow::mutex_lock ml(lock);
-#ifdef EXTRA_CACHE_LOGGING
-      tensorflow::mutex_lock ml2(global_cache_stats_lock);
-#endif
       auto key = std::make_tuple(e->M, e->K, e->N, e->max_threads);
-      --num_entries_outstanding;
-#ifdef EXTRA_CACHE_LOGGING
-      --total_num_entries_outstanding;
-      LOG(INFO) << "Returned cache entry at " << (void*)e.get() << " for "
-                << e->M << "x" << e->K << "x" << e->N << " max_threads "
-                << e->max_threads
-                << ", num_entries_outstanding = " << num_entries_outstanding
-                << ", prev cache size = " << entries.size()
-                << ", total num_entries_outstanding = "
-                << total_num_entries_outstanding
-                << ", total cache size = " << total_num_entries_in_cache;
-#endif
-      entries.push_back(std::make_pair(key, std::move(e)));
-#ifdef EXTRA_CACHE_LOGGING
-      ++total_num_entries_in_cache;
-#endif
+      entries.insert(std::make_pair(key, std::move(e)));
     }
     ~TensorInfoCache() {
       tensorflow::mutex_lock ml(lock);
-#ifdef EXTRA_CACHE_LOGGING
-      tensorflow::mutex_lock ml2(global_cache_stats_lock);
-      LOG(INFO) << "Deleting TensorInfoCache, cache size = " << entries.size()
-                << ", total num_entries_outstanding = "
-                << total_num_entries_outstanding
-                << ", total cache size = " << total_num_entries_in_cache;
-#endif
-      CHECK_EQ(num_entries_outstanding, 0);
+      for (auto& p : entries) {
+        libxsmm_spmdm_destroy(&p.second->handle);
+      }
       entries.clear();
     }
 
    private:
     TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCache);
   };
-#else
-  // This structure contains a set of libxsmm kernels for sizes that have been
-  // encountered previously by this operator so that libxsmm does not need to
-  // reallocate its scratchpad memory each time (which hurts performance
-  // substantially).
-  struct TensorInfoCache {
-    struct TensorInfoCacheEntry {
-      // Parameters for kernel
-      int M;
-      int K;
-      int N;
-      int max_threads;
-      // libxsmm handle and matrix data
-      libxsmm_spmdm_handle handle;
-      libxsmm_CSR_sparseslice* output_csr;
-      // Chain to non-libxsmm implementation's cache in case that ever becomes
-      // useful (it is an empty struct right now)
-      typename SparseMatMul<TL, TR>::TensorInfoCache
-          non_libxsmm_cache;  // Currently not used
-    };
-    TensorInfoCache() {}
-    // Look up and remove first entry with these parameters, creating one if
-    // there isn't one
-    std::unique_ptr<TensorInfoCacheEntry> take_cache_entry(int M, int K, int N,
-                                                           int max_threads) {
-      std::unique_ptr<TensorInfoCacheEntry> e{
-          new TensorInfoCacheEntry{M, K, N, max_threads, {}, nullptr}};
-      libxsmm_spmdm_init(M, N, K, max_threads, &e->handle, &e->output_csr);
-      return e;
-    }
-    // Add a cache entry with certain parameters
-    void return_cache_entry(std::unique_ptr<TensorInfoCacheEntry> e) {
-      libxsmm_spmdm_destroy(&e->handle);
-    }
-
-   private:
-    TF_DISALLOW_COPY_AND_ASSIGN(TensorInfoCache);
-  };
-#endif
 
   // Perform matrix multiplication of "left" and "right", and store the result
   // in *"output".
@@ -1479,21 +1345,21 @@ inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
 
 template <typename F>
 void do_on_all_threads(const DeviceBase::CpuWorkerThreads* thread_pool,
-                       ptrdiff_t max_thread_count, const F& f) {
+                       const F& f) {
   int num_threads = thread_pool->num_threads;
   if (num_threads == 0) {
     LOG(FATAL) << "Have 0 threads in thread pool";
   } else if (num_threads == 1) {
-    f(0, 1);
+    f(0);
   } else {
     BlockingCounter counter(num_threads - 1);
     for (int i = 1; i < num_threads; ++i) {
       thread_pool->workers->Schedule([&, i]() {
-        f(i, num_threads);
+        f(i);
         counter.DecrementCount();
       });
     }
-    f(0, num_threads);
+    f(0);
     counter.Wait();
   }
 }
@@ -1522,24 +1388,21 @@ void wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
 
 void wrapper_libxsmm_spmdm_compute_generic_thread(
     empty_type_wrapper<bfloat16>, const libxsmm_spmdm_handle* handle,
-    char transA, char transB, libxsmm_CSR_sparseslice* A_sparse,
-    const bfloat16* B, char transC, float* C, int block_id, int tid,
-    int nthreads) {
-  const uint16 alpha = 1;
-  const uint16 beta = 0;
+    char transA, char transB, const bfloat16* alpha,
+    libxsmm_CSR_sparseslice* A_sparse, const bfloat16* B, char transC,
+    const bfloat16* beta, float* C, int block_id, int tid, int nthreads) {
   return libxsmm_spmdm_compute_bfloat16_thread(
-      handle, transA, transB, &alpha, A_sparse,
-      reinterpret_cast<const uint16*>(B), transC, &beta, C, block_id, tid,
-      nthreads);
+      handle, transA, transB, reinterpret_cast<const uint16*>(alpha), A_sparse,
+      reinterpret_cast<const uint16*>(B), transC,
+      reinterpret_cast<const uint16*>(beta), C, block_id, tid, nthreads);
 }
 void wrapper_libxsmm_spmdm_compute_generic_thread(
     empty_type_wrapper<float>, const libxsmm_spmdm_handle* handle, char transA,
-    char transB, libxsmm_CSR_sparseslice* A_sparse, const float* B, char transC,
-    float* C, int block_id, int tid, int nthreads) {
-  const float alpha = 1.f;
-  const float beta = 0.f;
-  return libxsmm_spmdm_compute_fp32_thread(handle, transA, transB, &alpha,
-                                           A_sparse, B, transC, &beta, C,
+    char transB, const float* alpha, libxsmm_CSR_sparseslice* A_sparse,
+    const float* B, char transC, const float* beta, float* C, int block_id,
+    int tid, int nthreads) {
+  return libxsmm_spmdm_compute_fp32_thread(handle, transA, transB, alpha,
+                                           A_sparse, B, transC, beta, C,
                                            block_id, tid, nthreads);
 }
 
@@ -1590,13 +1453,11 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
   const int left_dim1 = transpose_left ? left.dimension(0) : left.dimension(1);
   const int right_dim0 = right.dimension(0);
   const int right_dim1 = right.dimension(1);
-  const int output_dim0 =
-      transpose_output ? output->dimension(1) : output->dimension(0);
-  const int output_dim1 =
-      transpose_output ? output->dimension(0) : output->dimension(1);
   CHECK_EQ(left_dim1, right_dim0);
-  CHECK_EQ(left_dim0, output_dim0);
-  CHECK_EQ(right_dim1, output_dim1);
+  CHECK_EQ(left_dim0,
+           (transpose_output ? output->dimension(1) : output->dimension(0)));
+  CHECK_EQ(right_dim1,
+           (transpose_output ? output->dimension(0) : output->dimension(1)));
   if (left_dim0 < 32 || left_dim1 < 32 || right_dim1 < 32) {
     // Causes problems in libxsmm
     SparseMatMul<TL, TR>::Compute(
@@ -1614,51 +1475,42 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
   // Convert the left matrix to compressed sparse row (CSR) format
   ptrdiff_t total_num_creation_blocks =
       libxsmm_spmdm_get_num_createSparseSlice_blocks(&entry->handle);
-  ptrdiff_t total_num_mult_blocks =
-      libxsmm_spmdm_get_num_compute_blocks(&entry->handle);
-  bool use_libxsmm =
-      !(total_num_creation_blocks + total_num_mult_blocks < num_threads &&
-        !transpose_left && !transpose_output);
-  if (!use_libxsmm) {
-    // Avoid some performance issues in libxsmm (FIXME)
-    cache->return_cache_entry(std::move(entry));
-    SparseMatMul<TL, TR>::Compute(
-        nullptr /* Assumes no cached data for fallback */, left, right,
-        transpose_left, thread_pool, transpose_output, output);
-    return;
-  }
   std::atomic<int> cur_create_block_number;
   cur_create_block_number.store(0);
-  do_on_all_threads(thread_pool, total_num_creation_blocks,
-                    [&](int i, int actual_num_threads) {
-                      PinnedToCurrentCPU pin;
-                      while (true) {
-                        int work_item = cur_create_block_number.fetch_add(1);
-                        if (work_item >= total_num_creation_blocks) break;
-                        wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
-                            empty_type_wrapper<TL>{}, &entry->handle,
-                            (transpose_left ? 'T' : 'N'), left_data,
-                            entry->output_csr, work_item, i,
-                            actual_num_threads);
-                      }
-                    });
-
+  do_on_all_threads(thread_pool, [&](int i) {
+    PinnedToCurrentCPU pin;
+    while (true) {
+      int work_item = cur_create_block_number.fetch_add(1);
+      if (work_item >= total_num_creation_blocks) break;
+      wrapper_libxsmm_spmdm_createSparseSlice_generic_thread(
+          empty_type_wrapper<TL>{}, &entry->handle,
+          (transpose_left ? 'T' : 'N'), left_data, entry->output_csr, work_item,
+          i, num_threads);
+    }
+  });
   // Do matrix-matrix multiplication
+  // TODO(jewillco): libxsmm doesn't support beta != 1 yet -- remove when
+  // release
+  // includes beta handling
+  memset(output_data, 0, left_dim0 * right_dim1 * sizeof(TR));
+  ptrdiff_t total_num_mult_blocks =
+      libxsmm_spmdm_get_num_compute_blocks(&entry->handle);
   std::atomic<int> cur_mult_block_number;
   cur_mult_block_number.store(0);
-  do_on_all_threads(
-      thread_pool, total_num_mult_blocks, [&](int i, int actual_num_threads) {
-        PinnedToCurrentCPU pin;
-        while (true) {
-          int work_item = cur_mult_block_number.fetch_add(1);
-          if (work_item >= total_num_mult_blocks) break;
-          wrapper_libxsmm_spmdm_compute_generic_thread(
-              empty_type_wrapper<TL>{}, &entry->handle,
-              (transpose_left ? 'T' : 'N'), 'N', entry->output_csr, right_data,
-              (transpose_output ? 'T' : 'N'), output_data, work_item, i,
-              actual_num_threads);
-        }
-      });
+  do_on_all_threads(thread_pool, [&](int i) {
+    PinnedToCurrentCPU pin;
+    while (true) {
+      int work_item = cur_mult_block_number.fetch_add(1);
+      if (work_item >= total_num_mult_blocks) break;
+      const TL alpha(1.0);  // Stored in a variable so we can get a pointer
+      const TL beta(0.0);   // Stored in a variable so we can get a pointer
+      wrapper_libxsmm_spmdm_compute_generic_thread(
+          empty_type_wrapper<TL>{}, &entry->handle,
+          (transpose_left ? 'T' : 'N'), 'N', &alpha, entry->output_csr,
+          right_data, (transpose_output ? 'T' : 'N'), &beta, output_data,
+          work_item, i, num_threads);
+    }
+  });
   // Put handle + CSR storage back into cache
   cache->return_cache_entry(std::move(entry));
 }
@@ -1804,17 +1656,15 @@ inline void SparseMatMul<TL, TR>::Compute(
                           SparseMatMulOp<TA, TB, LibxsmmSparseMatMul>);
 #endif
 
+REGISTER_SPARSE_MATMUL(bfloat16, bfloat16);
+
 REGISTER_SPARSE_MATMUL(float, bfloat16);
 
 REGISTER_SPARSE_MATMUL(bfloat16, float);
 
 #ifdef TENSORFLOW_USE_LIBXSMM
-REGISTER_SPARSE_MATMUL_LIBXSMM(bfloat16, bfloat16);
-
 REGISTER_SPARSE_MATMUL_LIBXSMM(float, float);
 #else
-REGISTER_SPARSE_MATMUL(bfloat16, bfloat16);
-
 REGISTER_SPARSE_MATMUL(float, float);
 #endif
 
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index bff6a0c9b3..61bd6593c3 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -255,13 +255,12 @@ EIGEN_STRONG_INLINE Packet8d pbroadcast_second<Packet8d>(const Packet8d& a_in) {
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pbroadcast_third<Packet8d>(const Packet8d& a_in) {
-  Packet2d a = _mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1);
+  Packet2d a = _mm512_extractf32x4_ps(a_in, 1);
   return _mm512_broadcastsd_pd(a);
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pbroadcast_fourth<Packet8d>(const Packet8d& a_in) {
-  Packet2d a =
-      _mm_permute_pd(_mm256_extractf128_pd(_mm512_castpd512_pd256(a_in), 1), 3);
+  Packet2d a = _mm_permute_pd(_mm512_extractf32x4_ps(a_in, 1), 3);
   return _mm512_broadcastsd_pd(a);
 }
 template <>
@@ -418,17 +417,14 @@ EIGEN_STRONG_INLINE Packet8f pbroadcast_fourth<Packet8f>(const Packet8f& a) {
 
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_l(const Packet16f& from) {
-  return _mm512_castsi512_ps(_mm512_slli_epi32(
-      _mm512_cvtepu16_epi32(_mm512_castsi512_si256(_mm512_castps_si512(from))),
-      16));
+  return _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(from)),
+                           16);
 }
 
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet16f pexpand_bf16_u(const Packet16f& from) {
-  return _mm512_castsi512_ps(
-      _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_castpd_si256(
-                            _mm512_extractf64x4_pd(_mm512_castps_pd(from), 1))),
-                        16));
+  return _mm512_slli_epi32(
+      _mm512_cvtepu16_epi32(_mm512_extractf64x4_pd(from, 1)), 16);
 }
 
 #endif

From c2dea53ae3c0588e49c21b0c735af98810e0c3ab Mon Sep 17 00:00:00 2001
From: Alexander Heinecke <alexander.heinecke@intel.com>
Date: Thu, 16 Mar 2017 16:08:30 -0700
Subject: [PATCH 10/15] added initial padding support for backward (#21)

---
 .../core/kernels/conv_grad_input_ops.cc       | 67 +++++++++++++------
 1 file changed, 46 insertions(+), 21 deletions(-)

diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 8cd7f5db7e..7e0912b4db 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -236,22 +236,30 @@ class Conv2DFastBackpropInputOp : public OpKernel {
                    context->allocate_output(0, input_shape, &in_backprop));
 
 #if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
-/*
-    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
-                                         padding_, &out_rows, &pad_rows));
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
-                                         padding_, &out_cols, &pad_cols));
-*/
-    if (LaunchXsmmBackwardInputConvolution<Device, T>()(
+    int64 pad_top, pad_bottom;
+    int64 pad_left, pad_right;
+    OP_REQUIRES_OK(
+        context,
+        GetWindowedOutputSizeVerbose(
+            dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+            dims.spatial_dims[0].stride, padding_,
+            &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+    OP_REQUIRES_OK(
+        context,
+        GetWindowedOutputSizeVerbose(
+            dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+            dims.spatial_dims[1].stride, padding_,
+            &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+
+    if ( pad_left == pad_right && pad_top == pad_bottom ) {
+      if (LaunchXsmmBackwardInputConvolution<Device, T>()(
             context, context->eigen_device<Device>(),
             in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
             out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
-            dims.spatial_dims[1].input_size, dims.spatial_dims[0].stride,
-            dims.spatial_dims[1].stride, data_format_)) {
-      return;
+            dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
+            (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left, data_format_)) {
+        return;
+      }
     }
 #endif
 
@@ -318,21 +326,38 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input_shape, &in_backprop));
 
+    // TODO(andydavis) Consider moving code shared with
+    // Conv2DCustomBackpropFilterOp into a shared helper function.
 #if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
-    if (LaunchXsmmBackwardInputConvolution<Device, T>()(
+    int64 pad_top, pad_bottom;
+    int64 pad_left, pad_right;
+    OP_REQUIRES_OK(
+        context,
+        GetWindowedOutputSizeVerbose(
+            dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+            dims.spatial_dims[0].stride, padding_,
+            &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+    OP_REQUIRES_OK(
+        context,
+        GetWindowedOutputSizeVerbose(
+            dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+            dims.spatial_dims[1].stride, padding_,
+            &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+
+    if ( pad_left == pad_right && pad_top == pad_bottom ) {
+      if (LaunchXsmmBackwardInputConvolution<Device, T>()(
             context, context->eigen_device<Device>(),
             in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
             out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
-            dims.spatial_dims[1].input_size, dims.spatial_dims[0].stride,
-            dims.spatial_dims[1].stride, data_format_)) {
-      return;
+            dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
+            (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left, data_format_)) {
+        return;
+      }
     }
-#endif
-
-    // TODO(andydavis) Consider moving code shared with
-    // Conv2DCustomBackpropFilterOp into a shared helper function.
+#else
     int64 pad_top, pad_bottom;
     int64 pad_left, pad_right;
+#endif
     OP_REQUIRES_OK(
         context,
         GetWindowedOutputSizeVerbose(

From 4f5ff50957bde0aabc2d528f4fb3bc09a76f39bc Mon Sep 17 00:00:00 2001
From: taknevski <taknevski@users.noreply.github.com>
Date: Mon, 20 Mar 2017 18:46:44 -0700
Subject: [PATCH 11/15] weight update (#23)

---
 .../core/kernels/conv_grad_filter_ops.cc      | 118 ++++++++++++++++++
 tensorflow/core/kernels/xsmm_conv2d.cc        |  39 +++---
 2 files changed, 141 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 7d67fb3a7d..a59277f18e 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -91,6 +91,78 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
+#ifdef TENSORFLOW_USE_LIBXSMM
+template <typename Device, class T>
+struct LaunchXsmmBackwardFilter {
+  bool operator()(OpKernelContext* context, const Device& d,
+                  typename TTypes<T, 4>::ConstTensor input_backward,
+                  typename TTypes<T, 4>::Tensor kernel,
+                  typename TTypes<T, 4>::ConstTensor output_backward,
+                  int input_rows, int input_cols, int row_stride,
+                  int col_stride, int pad_h, int pad_w, TensorFormat data_format) const {
+    return false;
+  }
+};
+ 
+template <>
+struct LaunchXsmmBackwardFilter<CPUDevice, float> {
+  bool operator()(OpKernelContext* context, const CPUDevice& d,
+                  typename TTypes<float, 4>::ConstTensor input,
+                  typename TTypes<float, 4>::Tensor filter,
+                  typename TTypes<float, 4>::ConstTensor output,
+                  int input_rows, int input_cols, int row_stride,
+                  int col_stride,int pad_h, int pad_w,  TensorFormat data_format) const {
+    auto batch = input.dimension(0);
+    auto in_depth = input.dimension(3);
+    auto out_depth = output.dimension(3);
+    auto filter_rows = filter.dimension(0);
+    auto filter_cols = filter.dimension(1);
+ 
+    auto num_threads =
+        context->device()->tensorflow_cpu_worker_threads()->num_threads;
+    // See libxsmm_dnn.h for this struct definition.
+    libxsmm_dnn_conv_desc desc;
+    desc.N = batch;
+    desc.C = in_depth;
+    desc.H = input_rows;
+    desc.W = input_cols;
+    desc.K = out_depth;
+    desc.R = filter_rows;
+    desc.S = filter_cols;
+    desc.u = row_stride;
+    desc.v = col_stride;
+    desc.pad_h = pad_h;
+    desc.pad_w = pad_w;
+    desc.pad_h_in = 0;  // pad_rows;  // ignored by libxsmm for now.
+    desc.pad_w_in = 0;  // pad_cols;  // ignored by libxsmm for now.
+    desc.pad_h_out = 0;
+    desc.pad_w_out = 0;
+    desc.threads = num_threads;
+    desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
+    desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
+    desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
+    desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
+    desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+    desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
+ 
+ 
+    if (!CanUseXsmmConv2D(desc, data_format)) {
+      return false;
+    }
+ 
+ 
+    auto input_ptr = input.data();
+    auto filter_ptr = filter.data();
+    auto output_ptr = output.data();
+    bool success = functor::XsmmBkwFilterConv2D<CPUDevice, float>()(
+        context, desc, input_ptr, filter_ptr, output_ptr);
+    return success;
+  }
+};
+#endif
+
+
+
 template <typename Device, class T>
 class Conv2DFastBackpropFilterOp : public OpKernel {
  public:
@@ -138,6 +210,39 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, filter_shape, &filter_backprop));
 
+    #if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+ 
+    int64 pad_top, pad_bottom;
+    int64 pad_left, pad_right;
+    OP_REQUIRES_OK(
+        context,
+        GetWindowedOutputSizeVerbose(
+            dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+            dims.spatial_dims[0].stride, padding_,
+            &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
+    OP_REQUIRES_OK(
+        context,
+        GetWindowedOutputSizeVerbose(
+            dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+            dims.spatial_dims[1].stride, padding_,
+            &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+ 
+    if ( pad_left == pad_right && pad_top == pad_bottom ) {
+ 
+      if (LaunchXsmmBackwardFilter<Device, T>()(
+            context, context->eigen_device<Device>(),
+            input.tensor<T, 4>(),filter_backprop->tensor<T, 4>(),
+            out_backprop.tensor<T, 4>(),  dims.spatial_dims[0].input_size,
+            dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
+            (int)dims.spatial_dims[1].stride,(int)pad_top, (int)pad_left, data_format_)) {
+      return;
+      }
+    }
+    #endif
+
+
+
+
     functor::SpatialConvolutionBackwardKernel<Device, T>()(
         context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(),
         input.tensor<T, 4>(), out_backprop.tensor<T, 4>(),
@@ -216,6 +321,19 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
             dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
             dims.spatial_dims[1].stride, padding_,
             &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
+  #if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+    if ( pad_left == pad_right && pad_top == pad_bottom ) {
+ 
+      if (LaunchXsmmBackwardFilter<Device, T>()(
+            context, context->eigen_device<Device>(),
+            input.tensor<T, 4>(),filter_backprop->tensor<T, 4>(),
+            out_backprop.tensor<T, 4>(),  dims.spatial_dims[0].input_size,
+            dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
+            (int)dims.spatial_dims[1].stride,(int)pad_top, (int)pad_left, data_format_)) {
+      return;
+      }
+    }
+  #endif
 
     // The total dimension size of each kernel.
     const int filter_total_size = dims.spatial_dims[0].filter_size *
diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index 034a49cbb5..157616d1bc 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -253,35 +253,42 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 
   int num_threads = worker_threads->num_threads;
 
-#if 1  
-  if(blocksofm > num_threads){
-    int work = blocksofm;
-    BlockingCounter count(num_threads);
-    for (int i = 0; i < num_threads; ++i) {
+#if 1
+  if(kind ==  LIBXSMM_DNN_COMPUTE_KIND_FWD || kind ==  LIBXSMM_DNN_COMPUTE_KIND_BWD){
+    if(blocksofm > num_threads){
+      int work = blocksofm;
+      BlockingCounter count(num_threads);
+      for (int i = 0; i < num_threads; ++i) {
         worker_threads->workers->Schedule([=, &count]() {
         int start = work/num_threads*i;
-        int end =  (start + work/num_threads) > work ? work: start + work/num_threads;  
+        int end =  (start + work/num_threads) > work ? work: start + work/num_threads;
         copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S,desc.C, desc.K,blocksifm,blocksofm,ifmblock,ofmblock,start, end);
         count.DecrementCount();
         });
+      }
+      count.Wait();
     }
-    count.Wait();
-  }
-  else{
-
-    int work = blocksofm;
-    int num_threads = work;
-    
-    BlockingCounter count(num_threads);
-    for (int i = 0; i < num_threads; ++i) {
+    else{
+ 
+      int work = blocksofm;
+      int num_threads = work;
+ 
+      BlockingCounter count(num_threads);
+      for (int i = 0; i < num_threads; ++i) {
         worker_threads->workers->Schedule([=, &count]() {
         int start = i;
         int end =  i+1;
         copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S,desc.C, desc.K,blocksifm,blocksofm,ifmblock,ofmblock, start, end);
         count.DecrementCount();
         });
+      }
+      count.Wait();
     }
-    count.Wait();
+  }
+  //Added: for weight update
+  else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD){
+    libxsmm_filter = libxsmm_dnn_link_filter(libxsmm_handle, LIBXSMM_DNN_FILTER, filter, LIBXSMM_DNN_TENSOR_FORMAT_RSCK_PTR, &status);
+    chk_libxsmm_err(status, "Link filter");//weight update is in RSCK as filter should be returned in RSCK format
   }
 #else
   memset( native_filter, 0, blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float));

From 654263e9144dd1ac0f1b9205e819df2c4f3505f2 Mon Sep 17 00:00:00 2001
From: taknevski <taknevski@users.noreply.github.com>
Date: Tue, 21 Mar 2017 12:21:53 -0700
Subject: [PATCH 12/15] Bug fix involving weight update (#24)

* weight update

* bug fix
---
 tensorflow/core/kernels/xsmm_conv2d.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index 157616d1bc..d3a29c2e3e 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -304,10 +304,11 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   libxsmm_output = libxsmm_dnn_link_buffer(
       libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
   chk_libxsmm_err(status, "Link output buffer");
+  if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD || kind == LIBXSMM_DNN_COMPUTE_KIND_BWD){
   libxsmm_filter = libxsmm_dnn_link_filter(
       libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
   chk_libxsmm_err(status, "Link filter");
-
+  }
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
     chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
 

From f57b47c77f9333c81a6ca29b540b8479b5634496 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Thu, 30 Mar 2017 10:12:52 -0700
Subject: [PATCH 13/15] Upgraded to libxsmm 1.8

---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 652ee13760..818a7e64bd 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -89,11 +89,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "libxsmm_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/940a1dc6173fef49881a69b9249e4bbaf3380d09.tar.gz",
-          "https://github.com/hfp/libxsmm/archive/940a1dc6173fef49881a69b9249e4bbaf3380d09.tar.gz",
+          "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/1.8.tar.gz",
+          "https://github.com/hfp/libxsmm/archive/1.8.tar.gz",
       ],
-      sha256 = "dced27a393abf169594ff619dde6879c3220c39fdd83e4cd228553d4f8cd2a92",
-      strip_prefix = "libxsmm-940a1dc6173fef49881a69b9249e4bbaf3380d09",
+      sha256 = "0330201afb5525d0950ec861fec9dd75eb40a03845ebe03d2c635cf8bfc14fea",
+      strip_prefix = "libxsmm-1.8",
       build_file = str(Label("//third_party:libxsmm.BUILD")),
   )
 

From 3047716835d502d6302e5fed228a1a90882312d3 Mon Sep 17 00:00:00 2001
From: taknevski <avenkat@ssr-nfs1.sc.intel.com>
Date: Fri, 31 Mar 2017 16:06:44 -0700
Subject: [PATCH 14/15] adding python3 support

---
 third_party/libxsmm.BUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/third_party/libxsmm.BUILD b/third_party/libxsmm.BUILD
index 32f48d9fd2..9559f8be80 100644
--- a/third_party/libxsmm.BUILD
+++ b/third_party/libxsmm.BUILD
@@ -117,24 +117,28 @@ cc_library(
 
 py_library(
     name = "libxsmm_scripts",
+    srcs_version = "PY2AND3",
     srcs = glob(["scripts/*.py"]),
     data = ["version.txt"],
 )
 
 py_binary(
     name = "libxsmm_interface",
+    srcs_version = "PY2AND3",
     srcs = ["scripts/libxsmm_interface.py"],
     deps = [":libxsmm_scripts"],
 )
 
 py_binary(
     name = "libxsmm_config",
+    srcs_version = "PY2AND3",
     srcs = ["scripts/libxsmm_config.py"],
     deps = [":libxsmm_scripts"],
 )
 
 py_binary(
     name = "libxsmm_dispatch",
+    srcs_version = "PY2AND3",
     srcs = ["scripts/libxsmm_dispatch.py"],
     deps = [":libxsmm_scripts"],
 )

From 0ccfe7f54f2623282125e7e6bc133ed6d8be0b54 Mon Sep 17 00:00:00 2001
From: taknevski <avenkat@ssr-nfs1.sc.intel.com>
Date: Fri, 31 Mar 2017 16:07:26 -0700
Subject: [PATCH 15/15] libxsmm cahnges for caching handles

---
 .../core/kernels/conv_grad_filter_ops.cc      |  5 +-
 tensorflow/core/kernels/xsmm_conv2d.cc        | 61 +++++++++++--------
 2 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index a59277f18e..12e61f56e2 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -142,14 +142,15 @@ struct LaunchXsmmBackwardFilter<CPUDevice, float> {
     desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
     desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
-    desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
+    desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
  
  
     if (!CanUseXsmmConv2D(desc, data_format)) {
       return false;
     }
- 
+    if(desc.u > 1 || desc.v > 1)
+      return false;
  
     auto input_ptr = input.data();
     auto filter_ptr = filter.data();
diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index d3a29c2e3e..c9c53edefc 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -24,7 +24,7 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void);
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/kernels/xsmm_conv2d.h"
-
+#include <mutex>
 #include <stdlib.h>
 #include <cstring>
 #if 0
@@ -109,7 +109,6 @@ LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float *kcrs, int R, i
   }
 }
 
- 
 
 class libxsmm_dnn_conv_desc_wrap{
   public:
@@ -128,34 +127,44 @@ class libxsmm_dnn_conv_desc_wrap{
               d.u == w.d.u &&
               d.v == w.d.v &&
               d.pad_h == w.d.pad_h &&
-              d.pad_w == w.d.pad_w
-            );
+              d.pad_w == w.d.pad_w &&
+              d.pad_h_in == w.d.pad_h_in &&
+              d.pad_w_in == w.d.pad_w_in &&
+              d.pad_h_out == w.d.pad_h_out &&
+              d.pad_w_out == w.d.pad_w_out &&
+              d.threads == w.d.threads &&
+              d.algo == d.algo &&
+              d.buffer_format == w.d.buffer_format &&
+              d.filter_format == w.d.filter_format &&
+              d.fuse_ops == w.d.fuse_ops &&
+              d.options == w.d.options &&
+              d.datatype == w.d.datatype);
     }
 };
- 
- 
-struct HashFunction{
-  std::size_t operator()(const libxsmm_dnn_conv_desc_wrap & w) const{
-   
-    
 
 
+struct HashFunction{
+  std::size_t operator()(const libxsmm_dnn_conv_desc_wrap & w) const{
+ 
+ 
+ 
+ 
     //unsigned char ptr[sizeof(&w.d)];
-
-    
+ 
+ 
     //memcpy(ptr, (unsigned char *)&w.d, sizeof(&w.d))
-                                       
-
+ 
+ 
     //
-    /*
+ 
     std::ostringstream N,C,H,W,K,R,S,u,v,padh,padw;
  
     N << w.d.N; C << w.d.C;
     H << w.d.H; W << w.d.W;
     K << w.d.K; R << w.d.R;
     S << w.d.S; u << w.d.u;
-    v << w.d.v; padh << w.d.pad_h_in;
-    padw << w.d.pad_w_in;
+    v << w.d.v; padh << w.d.pad_h;
+    padw << w.d.pad_w;
  
  
     std::string out_ =   N.str() + C.str()\
@@ -164,16 +173,17 @@ struct HashFunction{
                        + S.str() + u.str()\
                        + v.str() + padh.str()\
                        + padw.str();
-    //
-    //
-    */
-    return ( std::hash<unsigned long long>()((unsigned long long)&(w.d)));
+    return ( std::hash<std::string>()(out_));
+    //return ( std::hash<unsigned long long>()((unsigned long long)&(w.d)));
   }
 };
  
+ 
 class handles{
   public:
     libxsmm_dnn_layer* find( const libxsmm_dnn_conv_desc_wrap &w) {
+ 
+      std::unique_lock<std::mutex> lock(mutex_);
       std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i = libxsmm_handles.find(w);
       if (i == libxsmm_handles.end()){
         libxsmm_dnn_err_t status;
@@ -192,11 +202,12 @@ class handles{
                     "Destroy handle");
     }
   private:
- 
-    std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction> libxsmm_handles;
+     std::mutex mutex_;
+     std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction> libxsmm_handles;
  
 };
 
+
 static handles libxsmm_handles;
 
 //#define LIBXSMM_DETAILED_TIMING
@@ -226,8 +237,8 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   
   status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind);
   if (status == LIBXSMM_DNN_WARN_FALLBACK) {
-    chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
-                    "Destroy handle");
+    //chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
+    //                "Destroy handle");
     return false;  // Use non-libxsmm code
   }
   chk_libxsmm_err(status, "Check codegen status");