Fast tensor ops using MKL
- Accelerates some common tensor ops in Intel CPU inference for float32, especially for fully connected networks
- Can be turned on/off by cntk.cntk_py.enable_cpueval_optimization()/cntk.cntk_py.disable_cpueval_optimization()
KeDengMS committed Feb 9, 2018
1 parent 08bcbdc commit 19719a6
Showing 14 changed files with 343 additions and 7 deletions.
6 changes: 5 additions & 1 deletion Documentation/current_iteration.md
@@ -13,4 +13,8 @@ C.debugging.enable_profiler() # optional
#<trainer|evaluator|function> executions
<trainer|evaluator|function>.print_node_timing()
C.debugging.stop_profiler()
```
```

## CPU inference performance improvements using MKL
- Accelerates some common tensor ops in Intel CPU inference for float32, especially for fully connected networks
- Can be turned on/off by cntk.cntk_py.enable_cpueval_optimization()/cntk.cntk_py.disable_cpueval_optimization()
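A minimal usage sketch of the toggle described above (the model file name is hypothetical; the optimization is on by default for float32):

```
import cntk as C

z = C.load_model("model.cntkmodel")       # hypothetical float32 model
C.cntk_py.enable_cpueval_optimization()   # MKL fast path (the default)
# ... z.eval(...) on CPU now uses the accelerated tensor ops ...
C.cntk_py.disable_cpueval_optimization()  # revert to the generic implementation
```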
1 change: 1 addition & 0 deletions Examples/Image/Classification/MLP/Python/SimpleMNIST.py
@@ -112,6 +112,7 @@ def simple_mnist(tensorboard_logdir=None):
C.debugging.start_profiler()
C.debugging.enable_profiler()
C.debugging.set_node_timing(True)
#C.cntk_py.disable_cpueval_optimization() # uncomment this to check CPU eval perf without optimization

test_minibatch_size = 1024
num_samples = 10000
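To quantify the speedup, the profiler hooks in this example can bracket two eval runs, one with the optimization disabled. A rough A/B sketch, assuming a trainer and minibatch from the surrounding SimpleMNIST setup:

```
import cntk as C

def timed_eval(trainer, minibatch, use_mkl):
    # Hypothetical helper: trainer and minibatch come from the surrounding
    # example; the toggle and timing calls are the CNTK APIs shown above.
    if use_mkl:
        C.cntk_py.enable_cpueval_optimization()
    else:
        C.cntk_py.disable_cpueval_optimization()
    C.debugging.set_node_timing(True)
    trainer.test_minibatch(minibatch)
    trainer.print_node_timing()
```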
1 change: 1 addition & 0 deletions Makefile
@@ -370,6 +370,7 @@ MATH_SRC =\
$(SOURCEDIR)/Math/CPUMatrixTensorFloat.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorDouble.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorHalf.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorSpecial.cpp \
$(SOURCEDIR)/Math/CPURNGHandle.cpp \
$(SOURCEDIR)/Math/CPUSparseMatrix.cpp \
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
3 changes: 3 additions & 0 deletions Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h
@@ -319,6 +319,9 @@ namespace CNTK
CNTK_API void EnableNodeTiming();
CNTK_API void DisableNodeTimeing();

CNTK_API void EnableCPUEvalOptimization();
CNTK_API void DisableCPUEvalOptimization();

CNTK_API bool AreEquivalent(const ::CNTK::FunctionPtr& f1, const ::CNTK::FunctionPtr& f2);
CNTK_API bool AreEquivalent(const ::CNTK::Variable& v1, const ::CNTK::Variable& v2, bool allowParameterAndConstantsEquivalence = false);

15 changes: 15 additions & 0 deletions Source/CNTKv2LibraryDll/Common.cpp
@@ -220,6 +220,21 @@ namespace CNTK
Microsoft::MSR::CNTK::Globals::SetNodeTiming(false);
}

void EnableCPUEvalOptimization()
{
// optimization is only for float
int flags = Microsoft::MSR::CNTK::CPUMatrix<float>::GetOptimizationFlags();
flags |= Microsoft::MSR::CNTK::CPUMatrix<float>::OPT_EVAL_WITH_MKL;
Microsoft::MSR::CNTK::CPUMatrix<float>::SetOptimizationFlags(flags);
}

void DisableCPUEvalOptimization()
{
int flags = Microsoft::MSR::CNTK::CPUMatrix<float>::GetOptimizationFlags();
flags &= ~Microsoft::MSR::CNTK::CPUMatrix<float>::OPT_EVAL_WITH_MKL;
Microsoft::MSR::CNTK::CPUMatrix<float>::SetOptimizationFlags(flags);
}

bool AreEquivalent(const Variable& var1, const Variable& var2, bool allowParameterAndConstantsEquivalence)
{
bool areDynamicAxesCompatible = (var1.DynamicAxes().size() == var2.DynamicAxes().size());
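The pair above is plain bit masking on a per-element-type flag word. The same semantics in a short sketch (the constant mirrors the C++; everything else is illustrative):

```
OPT_EVAL_WITH_MKL = 1  # mirrors CPUMatrix<float>::OPT_EVAL_WITH_MKL

flags = 0
flags |= OPT_EVAL_WITH_MKL   # EnableCPUEvalOptimization sets the bit
assert flags & OPT_EVAL_WITH_MKL
flags &= ~OPT_EVAL_WITH_MKL  # DisableCPUEvalOptimization clears it
assert not flags & OPT_EVAL_WITH_MKL
```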
10 changes: 10 additions & 0 deletions Source/Math/CPUMatrix.h
@@ -433,6 +433,13 @@ class MATH_API CPUMatrix : public BaseMatrix<ElemType>
static int SetNumThreads(int numThreads);
static int GetMaxNumThreads();

enum OptimizationFlag
{
OPT_EVAL_WITH_MKL = 1, // use Intel MKL functions to speed up evaluation
};
static void SetOptimizationFlags(int flags);
static int GetOptimizationFlags();

static void SetCompatibleMode();

// static BLAS functions
@@ -579,6 +586,9 @@ class MATH_API CPUMatrix : public BaseMatrix<ElemType>
void Clear();

void ScatterValues(ElemType* indices, ElemType* value, ElemType* data, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step = 1);

private:
static int m_optimizationFlags;
};

typedef CPUMatrix<float> CPUSingleMatrix;
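Because m_optimizationFlags is a static member of the CPUMatrix<ElemType> template, each element type keeps its own flag word. An illustrative sketch of the defaults set by the instantiation files below (not CNTK API):

```
# One flag word per template instantiation; only float enables the
# MKL eval path by default (see CPUMatrixFloat/Double/Half.cpp below).
OPT_EVAL_WITH_MKL = 1
optimization_flags = {
    "float": OPT_EVAL_WITH_MKL,
    "double": 0,
    "half": 0,
}
```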
1 change: 1 addition & 0 deletions Source/Math/CPUMatrixDouble.cpp
@@ -9,4 +9,5 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash.
template class MATH_API CPUMatrix<double>;
template<> int CPUMatrix<double>::m_optimizationFlags = 0;
}}}
1 change: 1 addition & 0 deletions Source/Math/CPUMatrixFloat.cpp
@@ -21,4 +21,5 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash.
template class MATH_API CPUMatrix<float>;
template<> int CPUMatrix<float>::m_optimizationFlags = CPUMatrix<float>::OPT_EVAL_WITH_MKL; // enable eval MKL optimization by default
}}}
1 change: 1 addition & 0 deletions Source/Math/CPUMatrixHalf.cpp
@@ -115,6 +115,7 @@ void CPUMatrix<half>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, con

// explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash.
template class MATH_API CPUMatrix<half>;
template<> int CPUMatrix<half>::m_optimizationFlags = 0;

// instantiate templated methods
template void CPUMatrix<float>::AdaDelta(CPUMatrix<float>& gradients, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon);
34 changes: 28 additions & 6 deletions Source/Math/CPUMatrixImpl.h
@@ -2509,9 +2509,13 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignHardmaxOf(const CPUMatrix<ElemTy
if (this != &a)
RequireSize(a.GetNumRows(), a.GetNumCols());

bool isInplace = (us.Data() == a.Data());

if (!isInplace)
memset(us.Data(), 0, a.GetNumElements() * sizeof(ElemType));

if (isColWise)
{
#pragma omp parallel for
foreach_column (j, a)
{
// we need to extract max
@@ -2526,13 +2530,14 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignHardmaxOf(const CPUMatrix<ElemTy
}
}

foreach_row (i, us)
us(i, j) = (i == maxI) ? 1.0f : 0.0f;
if (isInplace)
memset(us.Data() + j * a.GetNumRows(), 0, a.GetNumRows() * sizeof(ElemType));

us(maxI, j) = 1.0f;
}
}
else
{
#pragma omp parallel for
foreach_row (i, a)
{
// we need to extract max
@@ -2547,8 +2552,13 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignHardmaxOf(const CPUMatrix<ElemTy
}
}

foreach_column (j, us)
us(i, j) = (j == maxJ) ? 1.0f : 0.0f;
if (isInplace)
{
foreach_column(j, us)
us(i, j) = (j == maxJ) ? 1.0f : 0.0f;
}
else
us(i, maxJ) = 1.0f;
}
}
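The AssignHardmaxOf rewrite above avoids touching every output element: unless the op is in-place, the output is zeroed once with memset and only the argmax position per column (or row) is written, with the outer loops parallelized via OpenMP. A NumPy sketch of the column-wise case, for illustration:

```
import numpy as np

def hardmax_colwise(a):
    # Zero the output up front, then write a single 1 at each column's
    # argmax row, matching the non-inplace branch above.
    out = np.zeros_like(a)
    out[np.argmax(a, axis=0), np.arange(a.shape[1])] = 1.0
    return out
```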

@@ -7054,6 +7064,18 @@ void CPUMatrix<ElemType>::SetCompatibleMode()
// #endif
}

template <class ElemType>
void CPUMatrix<ElemType>::SetOptimizationFlags(int flags)
{
m_optimizationFlags = flags;
}

template <class ElemType>
int CPUMatrix<ElemType>::GetOptimizationFlags()
{
return m_optimizationFlags;
}

// -----------------------------------------------------------------------
// entry points from Matrix.cpp; calls into CPUMatrixTensorOpImpl
// -----------------------------------------------------------------------
37 changes: 37 additions & 0 deletions Source/Math/CPUMatrixTensorImpl.h
@@ -389,6 +389,25 @@ static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType
// entry points from Matrix.cpp; also map op to a lambda
// -----------------------------------------------------------------------

// special tensor ops for inference speed
template <class ElemType>
bool CPUMatrixSpecialUnaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);

template <class ElemType>
bool CPUMatrixSpecialBinaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);

template <class ElemType>
bool CPUMatrixSpecialTernaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
@@ -404,6 +423,12 @@ void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatri
reductionOp != ElementWiseOperator::opElementwiseProduct)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");

#ifdef USE_MKL
if (!!(CPUMatrix<ElemType>::GetOptimizationFlags() & CPUMatrix<ElemType>::OPT_EVAL_WITH_MKL) &&
CPUMatrixSpecialUnaryTensorOpImpl(beta, a, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides))
return;
#endif

// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
@@ -433,6 +458,12 @@ void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CP
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum.");

#ifdef USE_MKL
if (!!(CPUMatrix<ElemType>::GetOptimizationFlags() & CPUMatrix<ElemType>::OPT_EVAL_WITH_MKL) &&
CPUMatrixSpecialBinaryTensorOpImpl(beta, a, b, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides))
return;
#endif

#define CaseBinaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3>& pp) \
@@ -461,6 +492,12 @@ void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CP
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");

#ifdef USE_MKL
if (!!(CPUMatrix<ElemType>::GetOptimizationFlags() & CPUMatrix<ElemType>::OPT_EVAL_WITH_MKL) &&
CPUMatrixSpecialTernaryTensorOpImpl(beta, a, b, c, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides))
return;
#endif

#define CaseTernaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4>& pp) \
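All three entry points above share one dispatch pattern: when built with USE_MKL and the OPT_EVAL_WITH_MKL bit is set, the special implementation runs first and returns true only if it handled the op; otherwise control falls through to the generic lambda-based path. A minimal sketch of that contract (the helper names are placeholders, not CNTK API):

```
def special_mkl_impl(op, *args):
    # Placeholder: pretend only a couple of op kinds have a fast path.
    return op in ("copy", "linear")

def generic_impl(op, *args):
    pass  # placeholder for the lambda-based fallback, which handles every op

def tensor_op(op, *args, mkl_enabled=True):
    # Try the fast path first; its return value says whether it took the op.
    if mkl_enabled and special_mkl_impl(op, *args):
        return
    generic_impl(op, *args)
```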