- Optimal pooling parallelized
- Cosmetic changes to assembly part of pooling.

- Added xbyak files

- Removed critical section from optimized pooling code

- Removed redundant relu code

- Cosmetic fixes

- Optimized VML powx (for openblas and atlas)

- Updated readme with current results (after VML optimizations)

- Fix to performance results

- coding standard fixes

Conflicts:
	README.md

- Fix warning for non-OpenMP build

- Halfbaked results and readme

- some additional characters

- Performance results for XeonE5 added
jczaja committed Jan 8, 2016
1 parent 9dbf7a3 commit 5a5ef29
Showing 12 changed files with 4,582 additions and 662 deletions.
20 changes: 10 additions & 10 deletions README.md
@@ -22,22 +22,23 @@ Framework development discussions and thorough bug reports are collected on [Iss

Happy brewing!

# Intel Caffe (OpenMP branch)
# Intel Caffe
This fork is dedicated to improving Caffe performance when running on CPUs (in particular Xeon servers).

## Performance Results:
Time measures are: average Forward-Backward as stated by *caffe time*. *speedup* is (master branch measure / openmp-conv-relu branch measure)
Time measures are: average Forward-Backward as stated by *caffe time*. *speedup* is (bvlc-caffe-master branch measure) / (intelcaffe-master branch measure)

#### Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (MKL 11.3, GCC 4.8.3):
Branch | googlenet(speedup: 5.5) | caffenet (speedup: 5.9) | alexnet(speedup: 5.5) | ciphar10-sigmoid-bn(speedup: 7.5)
Branch | googlenet(speedup: 6.5) | caffenet (speedup: 6.4) | alexnet(speedup: 6.2) | cifar10-sigmoid-bn(speedup: 9.5)
----------|-----------------------|-------------------------------|-----------------------------|---------------------
openmp (using OMP_NUM_THREADS=36)| 813ms|1369ms|1547ms|43ms
master |4438ms |8164ms|8644ms |323ms
intelcaffe-master |682ms|1276ms|1387ms|34ms
bvlc-caffe-master |4438ms|8164ms|8644ms |323ms

#### Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (OpenBLAS 0.2.14, GCC 4.8.3):
Branch | googlenet(speedup: 2.4) | caffenet (speedup: 3.7) | alexnet(speedup: 1.1)| ciphar10-sigmoid-bn(speedup: 6.6)
Branch | googlenet(speedup: 17.0) | caffenet (speedup: 8.7) | alexnet(speedup: 15.4) | cifar10-sigmoid-bn(speedup: 8.4)
----------|-----------------------|-------------------------------|----------|-------------------
openmp (using OMP_NUM_THREADS=36)| 7033ms|7076ms |57980ms|81ms
master |16848ms |26130ms |62091ms|538ms
intelcaffe-openmp |1169ms|3088ms|4628ms|63ms
bvlc-caffe-master |19767ms|26993ms|71152ms|529ms

So there is a significant speedup, which depends on how many CPU cores the platform has. Tests were made using MKL (available free of charge now) and OpenBLAS.

@@ -48,14 +49,13 @@ OS kernel for running two OpenMP threads on the same physical core (e.g. using H
### Building:
Build as usual, either with the Makefile or CMake. Both build systems will detect whether OpenMP is available for the compiler of your choice and use it.


### Running:
It is best NOT to use Hyperthreading. Either disable it in the BIOS or limit the number of OpenMP threads with the OMP_NUM_THREADS environment variable. If you are not sure how to set OMP_NUM_THREADS and cannot disable HT in the BIOS, leave it as is; you should still observe a performance gain, just not as significant as when not relying on HT.

##### Example of running:
###### Intel(R) Xeon(R) E5-2699 v3 @ 2.30GHz, two sockets, 18 CPU cores per socket
*OMP_NUM_THREADS=36 ./build/tools/caffe time -iterations 20 --model=models/bvlc_googlenet/train_val.prototxt*
*GOMP_CPU_AFFINITY="0-35" OMP_PROC_BIND=false OMP_NUM_THREADS=36 ./build/tools/caffe time -iterations 50 --model=models/bvlc_googlenet/train_val.prototxt*

##### Notes:
To check if you have HT enabled:
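The exact command the README suggests is truncated in this view; one common way to check on Linux (assuming `lscpu` from util-linux, with `/proc/cpuinfo` as a fallback) is:

```shell
# "Thread(s) per core: 2" means Hyperthreading is enabled
lscpu | grep -i "thread(s) per core"
# Alternatively, count logical CPUs; twice the physical core count means HT is on
grep -c ^processor /proc/cpuinfo
```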
35 changes: 17 additions & 18 deletions include/caffe/layers/pooling_layer_impl.hpp
@@ -1,17 +1,16 @@
#ifndef CAFFE_CODE_GENERATORS_POOLING_H_
#define CAFFE_CODE_GENERATORS_POOLING_H_

#include "caffe/proto/caffe.pb.h"
#include <vector>
#include "caffe/proto/caffe.pb.h"

#if defined __x86_64__ || defined _M_X64
# define XBYAK_NO_OP_NAMES
# define XBYAK_USE_MMAP_ALLOCATOR
# include "../xbyak/xbyak_util.h"
#endif

namespace caffe
{
namespace caffe {
// Declarations of CodeGenerator classes.

template <typename Dtype>
@@ -21,31 +20,31 @@ template <typename Dtype>
class Blob;

template <typename Dtype>
class PoolingCodeGeneratorForward
class PoolingCodeGeneratorForward
#if defined __x86_64__ || defined _M_X64
: public ::Xbyak::CodeGenerator
#endif
{
public:
public:
PoolingCodeGeneratorForward();
~PoolingCodeGeneratorForward();

typedef void (Callback_t)(
const Dtype* bottom_data,
Dtype* top_data,
const Dtype* bottom_data,
Dtype* top_data,
int top_count,
int batch_start,
int batch_end,
Dtype* top_mask,
void* mask,
PoolingLayer<Dtype>* layer,
bool use_top_mask);

Callback_t* Get_callback(
PoolingLayer<Dtype>* layer,
Blob<Dtype>* top,
PoolingLayer<Dtype>* layer,
Blob<Dtype>* top,
bool use_top_mask);

private:
private:
void Create_callback(PoolingLayer<Dtype>* layer);

static Callback_t Naive;
@@ -61,28 +60,28 @@ class PoolingCodeGeneratorBackward
: public ::Xbyak::CodeGenerator
#endif
{
public:
public:
PoolingCodeGeneratorBackward();
~PoolingCodeGeneratorBackward();

typedef void (Callback_t)(
const Dtype* top_diff,
Dtype* bottom_diff,
const Dtype* top_diff,
Dtype* bottom_diff,
int batch_start,
int batch_end,
bool use_top_mask,
const Dtype* top_mask,
const void* mask,
PoolingLayer<Dtype>* layer);

Callback_t* Get_callback(PoolingLayer<Dtype>* layer, Blob<Dtype>* top);

private:
private:
void Create_callback(PoolingLayer<Dtype>* layer);

static Callback_t Naive;
Callback_t* Callback;
std::vector<int> layer_output_shape_signature;
};
}
} // namespace caffe

#endif // CAFFE_CODE_GENERATORS_POOLING_H_
#endif // CAFFE_CODE_GENERATORS_POOLING_H_
5 changes: 0 additions & 5 deletions include/caffe/layers/relu_layer.hpp
@@ -8,7 +8,6 @@
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/neuron_layer.hpp"
#include "caffe/layers/relu_layer_impl.hpp"

namespace caffe {

@@ -18,10 +17,6 @@ namespace caffe {
*/
template <typename Dtype>
class ReLULayer : public NeuronLayer<Dtype> {
// Private code generators.
ReLUCodeGeneratorForward<Dtype> Forward_code_generator;
ReLUCodeGeneratorBackward<Dtype> Backward_code_generator;

public:
/**
* @param param provides ReLUParameter relu_param,
77 changes: 0 additions & 77 deletions include/caffe/layers/relu_layer_impl.hpp

This file was deleted.

17 changes: 15 additions & 2 deletions include/caffe/util/mkl_alternate.hpp
@@ -14,6 +14,19 @@ extern "C" {

// Functions that caffe uses but are not present if MKL is not linked.

template <typename Dtype>
inline void apply(int n,
const Dtype src[],
Dtype b,
Dtype target[],
Dtype (*func)(Dtype, Dtype)) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int i = 0; i < n; ++i) {
target[i] = func(src[i], b);
}
}
// A simple way to define the vsl unary functions. The operation should
// be in the form e.g. y[i] = sqrt(a[i])
#define DEFINE_VSL_UNARY_FUNC(name, operation) \
@@ -42,7 +55,7 @@ DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i]));
template<typename Dtype> \
void v##name(const int n, const Dtype* a, const Dtype b, Dtype* y) { \
CHECK_GT(n, 0); CHECK(a); CHECK(y); \
for (int i = 0; i < n; ++i) { operation; } \
apply(n, a, b, y, operation); \
} \
inline void vs##name( \
const int n, const float* a, const float b, float* y) { \
@@ -53,7 +66,7 @@ DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i]));
v##name<double>(n, a, b, y); \
}

DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, std::pow);  // y[i] = pow(a[i], b)

// A simple way to define the vsl binary functions. The operation should
// be in the form e.g. y[i] = a[i] + b[i]
62 changes: 40 additions & 22 deletions src/caffe/layers/pooling_layer.cpp
@@ -130,22 +130,29 @@ void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
// We'll output the mask to top[1] if it's of size >1.
const bool use_top_mask = top.size() > 1;

typename PoolingCodeGeneratorForward<Dtype>::Callback_t* generator_func =
Forward_code_generator.Get_callback(this, top[0], use_top_mask);
// We are getting top_mask here as mutable_cpu_data is not thread safe
// and calling it inside the parallel region creates a risk of a race condition
void* mask = NULL;
if (this->layer_param_.pooling_param().pool() ==
PoolingParameter_PoolMethod_MAX ) {
mask = (use_top_mask) ? static_cast<void*>(top[1]->mutable_cpu_data()) :
static_cast<void*>(max_idx_.mutable_cpu_data());
}

#ifdef _OPENMP
#pragma omp parallel for
#endif
for(int image = 0; image < bottom[0]->num(); ++image)
Forward_code_generator.Get_callback(
this,
top[0],
use_top_mask)(
bottom_data,
top_data,
top_count,
image,
image+1,
(use_top_mask) ? top[1]->mutable_cpu_data() : NULL,
this,
use_top_mask);
for (int image = 0; image < bottom[0]->num(); ++image)
generator_func(bottom_data,
top_data,
top_count,
image,
image+1,
mask,
this,
use_top_mask);
}

template <typename Dtype>
@@ -160,18 +167,29 @@ void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
// We'll output the mask to top[1] if it's of size >1.
const bool use_top_mask = top.size() > 1;

typename PoolingCodeGeneratorBackward<Dtype>::Callback_t* generator_func =
Backward_code_generator.Get_callback(this, top[0]);

// We are getting top_mask here as mutable_cpu_data is not thread safe
// and calling it inside the parallel region creates a risk of a race condition
void* mask = NULL;
if (this->layer_param_.pooling_param().pool() ==
PoolingParameter_PoolMethod_MAX ) {
mask = (use_top_mask) ? static_cast<void*>(top[1]->mutable_cpu_data()) :
static_cast<void*>(max_idx_.mutable_cpu_data());
}

#ifdef _OPENMP
#pragma omp parallel for
#endif
for(int image = 0; image < bottom[0]->num(); ++image)
Backward_code_generator.Get_callback(this, top[0])(
top_diff,
bottom_diff,
image,
image+1,
use_top_mask,
(use_top_mask) ? top[1]->cpu_data() : NULL,
this);
for (int image = 0; image < bottom[0]->num(); ++image)
generator_func(top_diff,
bottom_diff,
image,
image+1,
use_top_mask,
mask,
this);
}


Expand Down