Fast tensor ops using MKL
- Accelerates some common tensor ops in Intel CPU inference for float32, especially for fully connected networks
- Can be turned on/off by cntk.cntk_py.enable_cpueval_optimization()/cntk.cntk_py.disable_cpueval_optimization()
KeDengMS committed Feb 9, 2018
1 parent 08bcbdc commit 19719a6
Showing 14 changed files with 343 additions and 7 deletions.
6 changes: 5 additions & 1 deletion Documentation/current_iteration.md
@@ -13,4 +13,8 @@ C.debugging.enable_profiler() # optional
#<trainer|evaluator|function> executions
<trainer|evaluator|function>.print_node_timing()
C.debugging.stop_profiler()
```
```

## CPU inference performance improvements using MKL
- Accelerates some common tensor ops in Intel CPU inference for float32, especially for fully connected networks
- Can be turned on/off by cntk.cntk_py.enable_cpueval_optimization()/cntk.cntk_py.disable_cpueval_optimization()
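A minimal usage sketch of the toggle described above (the model file name is hypothetical; the optimization is on by default for float32):

```
import cntk as C

z = C.load_model("model.cntkmodel")       # hypothetical float32 model
C.cntk_py.enable_cpueval_optimization()   # MKL fast path (the default)
# ... z.eval(...) on CPU now uses the accelerated tensor ops ...
C.cntk_py.disable_cpueval_optimization()  # revert to the generic implementation
```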
1 change: 1 addition & 0 deletions Examples/Image/Classification/MLP/Python/SimpleMNIST.py
@@ -112,6 +112,7 @@ def simple_mnist(tensorboard_logdir=None):
C.debugging.start_profiler()
C.debugging.enable_profiler()
C.debugging.set_node_timing(True)
#C.cntk_py.disable_cpueval_optimization() # uncomment this to check CPU eval perf without optimization

test_minibatch_size = 1024
num_samples = 10000
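To quantify the speedup, the profiler hooks in this example can bracket two eval runs, one with the optimization disabled. A rough A/B sketch, assuming a trainer and minibatch from the surrounding SimpleMNIST setup:

```
import cntk as C

def timed_eval(trainer, minibatch, use_mkl):
    # Hypothetical helper: trainer and minibatch come from the surrounding
    # example; the toggle and timing calls are the CNTK APIs shown above.
    if use_mkl:
        C.cntk_py.enable_cpueval_optimization()
    else:
        C.cntk_py.disable_cpueval_optimization()
    C.debugging.set_node_timing(True)
    trainer.test_minibatch(minibatch)
    trainer.print_node_timing()
```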
1 change: 1 addition & 0 deletions Makefile
@@ -370,6 +370,7 @@ MATH_SRC =\
$(SOURCEDIR)/Math/CPUMatrixTensorFloat.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorDouble.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorHalf.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorSpecial.cpp \
$(SOURCEDIR)/Math/CPURNGHandle.cpp \
$(SOURCEDIR)/Math/CPUSparseMatrix.cpp \
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
3 changes: 3 additions & 0 deletions Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h
@@ -319,6 +319,9 @@ namespace CNTK
CNTK_API void EnableNodeTiming();
CNTK_API void DisableNodeTimeing();

CNTK_API void EnableCPUEvalOptimization();
CNTK_API void DisableCPUEvalOptimization();

CNTK_API bool AreEquivalent(const ::CNTK::FunctionPtr& f1, const ::CNTK::FunctionPtr& f2);
CNTK_API bool AreEquivalent(const ::CNTK::Variable& v1, const ::CNTK::Variable& v2, bool allowParameterAndConstantsEquivalence = false);

15 changes: 15 additions & 0 deletions Source/CNTKv2LibraryDll/Common.cpp
@@ -220,6 +220,21 @@ namespace CNTK
Microsoft::MSR::CNTK::Globals::SetNodeTiming(false);
}

void EnableCPUEvalOptimization()
{
// optimization is only for float
int flags = Microsoft::MSR::CNTK::CPUMatrix<float>::GetOptimizationFlags();
flags |= Microsoft::MSR::CNTK::CPUMatrix<float>::OPT_EVAL_WITH_MKL;
Microsoft::MSR::CNTK::CPUMatrix<float>::SetOptimizationFlags(flags);
}

void DisableCPUEvalOptimization()
{
int flags = Microsoft::MSR::CNTK::CPUMatrix<float>::GetOptimizationFlags();
flags &= ~Microsoft::MSR::CNTK::CPUMatrix<float>::OPT_EVAL_WITH_MKL;
Microsoft::MSR::CNTK::CPUMatrix<float>::SetOptimizationFlags(flags);
}

bool AreEquivalent(const Variable& var1, const Variable& var2, bool allowParameterAndConstantsEquivalence)
{
bool areDynamicAxesCompatible = (var1.DynamicAxes().size() == var2.DynamicAxes().size());
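The pair above is plain bit masking on a per-element-type flag word. The same semantics in a short sketch (the constant mirrors the C++; everything else is illustrative):

```
OPT_EVAL_WITH_MKL = 1  # mirrors CPUMatrix<float>::OPT_EVAL_WITH_MKL

flags = 0
flags |= OPT_EVAL_WITH_MKL   # EnableCPUEvalOptimization sets the bit
assert flags & OPT_EVAL_WITH_MKL
flags &= ~OPT_EVAL_WITH_MKL  # DisableCPUEvalOptimization clears it
assert not flags & OPT_EVAL_WITH_MKL
```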
10 changes: 10 additions & 0 deletions Source/Math/CPUMatrix.h
@@ -433,6 +433,13 @@ class MATH_API CPUMatrix : public BaseMatrix<ElemType>
static int SetNumThreads(int numThreads);
static int GetMaxNumThreads();

enum OptimizationFlag
{
OPT_EVAL_WITH_MKL = 1, // use Intel MKL functions to speed up evaluation
};
static void SetOptimizationFlags(int flags);
static int GetOptimizationFlags();

static void SetCompatibleMode();

// static BLAS functions
@@ -579,6 +586,9 @@ class MATH_API CPUMatrix : public BaseMatrix<ElemType>
void Clear();

void ScatterValues(ElemType* indices, ElemType* value, ElemType* data, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step = 1);

private:
static int m_optimizationFlags;
};

typedef CPUMatrix<float> CPUSingleMatrix;
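Because m_optimizationFlags is a static member of the CPUMatrix<ElemType> template, each element type keeps its own flag word. An illustrative sketch of the defaults set by the instantiation files below (not CNTK API):

```
# One flag word per template instantiation; only float enables the
# MKL eval path by default (see CPUMatrixFloat/Double/Half.cpp below).
OPT_EVAL_WITH_MKL = 1
optimization_flags = {
    "float": OPT_EVAL_WITH_MKL,
    "double": 0,
    "half": 0,
}
```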
1 change: 1 addition & 0 deletions Source/Math/CPUMatrixDouble.cpp
@@ -9,4 +9,5 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash.
template class MATH_API CPUMatrix<double>;
template<> int CPUMatrix<double>::m_optimizationFlags = 0;
}}}
1 change: 1 addition & 0 deletions Source/Math/CPUMatrixFloat.cpp
@@ -21,4 +21,5 @@ namespace Microsoft { namespace MSR { namespace CNTK {

// explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash.
template class MATH_API CPUMatrix<float>;
template<> int CPUMatrix<float>::m_optimizationFlags = CPUMatrix<float>::OPT_EVAL_WITH_MKL; // enable eval MKL optimization by default
}}}
1 change: 1 addition & 0 deletions Source/Math/CPUMatrixHalf.cpp
@@ -115,6 +115,7 @@ void CPUMatrix<half>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, con

// explicit instantiations, due to CPUMatrix being too big and causing VS2015 cl crash.
template class MATH_API CPUMatrix<half>;
template<> int CPUMatrix<half>::m_optimizationFlags = 0;

// instantiate templated methods
template void CPUMatrix<float>::AdaDelta(CPUMatrix<float>& gradients, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon);
34 changes: 28 additions & 6 deletions Source/Math/CPUMatrixImpl.h
@@ -2509,9 +2509,13 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignHardmaxOf(const CPUMatrix<ElemTy
if (this != &a)
RequireSize(a.GetNumRows(), a.GetNumCols());

bool isInplace = (us.Data() == a.Data());

if (!isInplace)
memset(us.Data(), 0, a.GetNumElements() * sizeof(ElemType));

if (isColWise)
{
#pragma omp parallel for
foreach_column (j, a)
{
// we need to extract max
@@ -2526,13 +2530,14 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignHardmaxOf(const CPUMatrix<ElemTy
}
}

foreach_row (i, us)
us(i, j) = (i == maxI) ? 1.0f : 0.0f;
if (isInplace)
memset(us.Data() + j * a.GetNumRows(), 0, a.GetNumRows() * sizeof(ElemType));

us(maxI, j) = 1.0f;
}
}
else
{
#pragma omp parallel for
foreach_row (i, a)
{
// we need to extract max
@@ -2547,8 +2552,13 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignHardmaxOf(const CPUMatrix<ElemTy
}
}

foreach_column (j, us)
us(i, j) = (j == maxJ) ? 1.0f : 0.0f;
if (isInplace)
{
foreach_column(j, us)
us(i, j) = (j == maxJ) ? 1.0f : 0.0f;
}
else
us(i, maxJ) = 1.0f;
}
}
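The AssignHardmaxOf rewrite above avoids touching every output element: unless the op is in-place, the output is zeroed once with memset and only the argmax position per column (or row) is written, with the outer loops parallelized via OpenMP. A NumPy sketch of the column-wise case, for illustration:

```
import numpy as np

def hardmax_colwise(a):
    # Zero the output up front, then write a single 1 at each column's
    # argmax row, matching the non-inplace branch above.
    out = np.zeros_like(a)
    out[np.argmax(a, axis=0), np.arange(a.shape[1])] = 1.0
    return out
```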

@@ -7054,6 +7064,18 @@ void CPUMatrix<ElemType>::SetCompatibleMode()
// #endif
}

template <class ElemType>
void CPUMatrix<ElemType>::SetOptimizationFlags(int flags)
{
m_optimizationFlags = flags;
}

template <class ElemType>
int CPUMatrix<ElemType>::GetOptimizationFlags()
{
return m_optimizationFlags;
}

// -----------------------------------------------------------------------
// entry points from Matrix.cpp; calls into CPUMatrixTensorOpImpl
// -----------------------------------------------------------------------
37 changes: 37 additions & 0 deletions Source/Math/CPUMatrixTensorImpl.h
@@ -389,6 +389,25 @@ static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType
// entry points from Matrix.cpp; also map op to a lambda
// -----------------------------------------------------------------------

// special tensor ops for inference speed
template <class ElemType>
bool CPUMatrixSpecialUnaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);

template <class ElemType>
bool CPUMatrixSpecialBinaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);

template <class ElemType>
bool CPUMatrixSpecialTernaryTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
@@ -404,6 +423,12 @@ void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatri
reductionOp != ElementWiseOperator::opElementwiseProduct)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");

#ifdef USE_MKL
if (!!(CPUMatrix<ElemType>::GetOptimizationFlags() & CPUMatrix<ElemType>::OPT_EVAL_WITH_MKL) &&
CPUMatrixSpecialUnaryTensorOpImpl(beta, a, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides))
return;
#endif

// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
@@ -433,6 +458,12 @@ void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CP
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum.");

#ifdef USE_MKL
if (!!(CPUMatrix<ElemType>::GetOptimizationFlags() & CPUMatrix<ElemType>::OPT_EVAL_WITH_MKL) &&
CPUMatrixSpecialBinaryTensorOpImpl(beta, a, b, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides))
return;
#endif

#define CaseBinaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3>& pp) \
@@ -461,6 +492,12 @@ void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CP
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");

#ifdef USE_MKL
if (!!(CPUMatrix<ElemType>::GetOptimizationFlags() & CPUMatrix<ElemType>::OPT_EVAL_WITH_MKL) &&
CPUMatrixSpecialTernaryTensorOpImpl(beta, a, b, c, o, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides))
return;
#endif

#define CaseTernaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4>& pp) \
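All three entry points above share one dispatch pattern: when built with USE_MKL and the OPT_EVAL_WITH_MKL bit is set, the special implementation runs first and returns true only if it handled the op; otherwise control falls through to the generic lambda-based path. A minimal sketch of that contract (the helper names are placeholders, not CNTK API):

```
def special_mkl_impl(op, *args):
    # Placeholder: pretend only a couple of op kinds have a fast path.
    return op in ("copy", "linear")

def generic_impl(op, *args):
    pass  # placeholder for the lambda-based fallback, which handles every op

def tensor_op(op, *args, mkl_enabled=True):
    # Try the fast path first; its return value says whether it took the op.
    if mkl_enabled and special_mkl_impl(op, *args):
        return
    generic_impl(op, *args)
```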