Added batch norm per-activation feed forward implementation.
Alexey Kamenev committed Feb 12, 2016
1 parent 6465c80 commit 4e9483b
Showing 9 changed files with 602 additions and 23 deletions.
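The per-activation batch-norm forward kernel itself is added in CuDnnConvolutionEngine.cuh (one of the nine files, not shown in this diff view). As a sketch only — assuming the column-major, one-column-per-sample layout implied by the crowIn and inT.n() arguments passed to the kernel launcher below — a CPU reference of what a per-activation forward training pass computes:

    // Reference (CPU) sketch of per-activation batch norm forward training.
    // Hypothetical helper, not part of this commit; the real GPU kernel lives
    // in CuDnnConvolutionEngine.cuh. Assumes column-major storage: crow values
    // per sample, n samples, element (i, j) at in[j * crow + i].
    #include <cmath>
    #include <cstddef>

    template <class ElemType>
    void BatchNormForwardTrainingRef(size_t crow, size_t n, const ElemType* in, ElemType* out,
                                     const ElemType* scale, const ElemType* bias, double epsilon,
                                     ElemType* saveMean, ElemType* saveInvStdDev)
    {
        for (size_t i = 0; i < crow; i++) // each activation gets its own statistics
        {
            double mean = 0;
            for (size_t j = 0; j < n; j++)
                mean += in[j * crow + i];
            mean /= n;
            double var = 0;
            for (size_t j = 0; j < n; j++)
            {
                double d = in[j * crow + i] - mean;
                var += d * d;
            }
            var /= n; // biased variance, as in batch normalization
            double invStdDev = 1.0 / std::sqrt(var + epsilon);
            saveMean[i] = (ElemType)mean;
            saveInvStdDev[i] = (ElemType)invStdDev;
            for (size_t j = 0; j < n; j++)
                out[j * crow + i] = (ElemType)(scale[i] * ((in[j * crow + i] - mean) * invStdDev) + bias[i]);
        }
    }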
2 changes: 1 addition & 1 deletion Source/Math/ConvolutionEngine.cpp
@@ -448,7 +448,7 @@ class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
}

ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples) override
ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples, BatchNormImpl /*bnImpl*/) override
{
return std::make_unique<DefaultConvolutionEngine<ElemType>>(deviceId, maxTempMemSizeInSamples);
}
9 changes: 8 additions & 1 deletion Source/Math/ConvolutionEngine.h
@@ -289,6 +289,13 @@ class MATH_API PoolingEngine
PoolingEngine& operator=(PoolingEngine&&) = delete;
};

// REVIEW alexeyk: this is a temporary hack until we find a better place for poor BatchNorm.
enum class BatchNormImpl
{
CuDnn,
Cntk
};

template <class ElemType>
class MATH_API ConvolutionEngineFactory
{
@@ -316,7 +323,7 @@ class MATH_API ConvolutionEngineFactory
virtual PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) = 0;
// virtual Tensor4DPtr CreateLrnDescriptor() = 0;

virtual ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples) = 0;
virtual ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl = BatchNormImpl::CuDnn) = 0;
virtual PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId) = 0;

enum class EngineType
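Since the new parameter defaults to BatchNormImpl::CuDnn, existing call sites keep the cuDNN implementation and a caller opts into the CNTK kernels explicitly. A hypothetical call site (factory, deviceId, and maxTempMemSizeInSamples are assumed names, not from this diff):

    // Default: cuDNN batch normalization.
    auto cudnnEng = factory->CreateConvEngine(deviceId, maxTempMemSizeInSamples);
    // Opt in to the new CNTK per-activation kernels.
    auto cntkEng = factory->CreateConvEngine(deviceId, maxTempMemSizeInSamples, BatchNormImpl::Cntk);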
Source/Math/CuDnnConvolutionEngine.cpp
@@ -8,6 +8,7 @@
#include "GPUMatrix.h"
#ifdef USE_CUDNN
#include <cudnn.h>
#include "CuDnnConvolutionEngine.cuh"

template <>
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
@@ -37,6 +38,32 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId)
#endif
}

// Recreates the CUDA events (destroying any left over from a previous run) and
// records the start event on the current stream, so a single timer can be reused.
void CudaTimer::Start()
{
    cudaEvent_t start;
    cudaEvent_t stop;
    if (m_start != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
    if (m_stop != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));
    m_start = start;
    m_stop = stop;
    CUDA_CALL(cudaEventRecord(start, GetStream()));
}

// Records the stop event and blocks the host until it completes.
void CudaTimer::Stop()
{
    CUDA_CALL(cudaEventRecord(reinterpret_cast<cudaEvent_t>(m_stop), GetStream()));
    CUDA_CALL(cudaEventSynchronize(reinterpret_cast<cudaEvent_t>(m_stop)));
}

// Returns the time between the start and stop events, in milliseconds.
float CudaTimer::Elapsed()
{
    float ms;
    CUDA_CALL(cudaEventElapsedTime(&ms, reinterpret_cast<cudaEvent_t>(m_start), reinterpret_cast<cudaEvent_t>(m_stop)));
    return ms;
}
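A minimal usage sketch for the timer (an assumed pattern, not code from this commit); Start may be called repeatedly because it recreates both events each time:

    CudaTimer timer;
    timer.Start();              // records the start event on the current stream
    LaunchSomeKernel();         // hypothetical work submitted to the same stream
    timer.Stop();               // records the stop event and waits for it to complete
    float ms = timer.Elapsed(); // GPU-side elapsed time in milliseconds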

#ifdef USE_CUDNN

class CuDnnTensor4D : public ConvolutionTensor4D
@@ -56,7 +83,7 @@ class CuDnnTensor4D : public ConvolutionTensor4D
return m_tensor;
}

~CuDnnTensor4D()
~CuDnnTensor4D() noexcept
{
if (m_tensor != nullptr)
{
@@ -94,7 +121,7 @@ class CuDnnFilter : public ConvolutionFilter
return m_filter;
}

~CuDnnFilter()
~CuDnnFilter() noexcept
{
if (m_filter != nullptr)
{
@@ -126,7 +153,7 @@ class CuDnnConvolutionDescriptor : public ConvolutionDescriptor
return m_conv;
}

~CuDnnConvolutionDescriptor()
~CuDnnConvolutionDescriptor() noexcept
{
if (m_conv != nullptr)
{
@@ -161,7 +188,7 @@ class CuDnnPoolingDescriptor : public PoolingDescriptor
return m_pool;
}

~CuDnnPoolingDescriptor()
~CuDnnPoolingDescriptor() noexcept
{
if (m_pool != nullptr)
{
@@ -233,11 +260,11 @@ class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
using typename Base::Filter;
using typename Base::ConvDesc;

CuDnnConvolutionEngine(size_t maxTempMemSizeInSamples)
: m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_cudnn(nullptr), m_curMBSize(0)
CuDnnConvolutionEngine(size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
: m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_bnImpl(bnImpl), m_cudnn(nullptr), m_stream(GetStream()), m_curMBSize(0)
{
CUDNN_CALL(cudnnCreate(&m_cudnn));
CUDNN_CALL(cudnnSetStream(m_cudnn, GetStream()));
CUDNN_CALL(cudnnSetStream(m_cudnn, m_stream));
m_fwdAlgo.status = CUDNN_STATUS_NOT_INITIALIZED;
m_backDataAlgo.status = CUDNN_STATUS_NOT_INITIALIZED;
m_backFiltAlgo.status = CUDNN_STATUS_NOT_INITIALIZED;
@@ -341,7 +368,6 @@ class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out, Mat& saveMean, Mat& saveInvStdDev) override
{
const size_t crowIn = inT.w() * inT.h() * inT.c();
UNUSED(crowIn); // crowIn used only in asserts.
if (spatial)
{
assert(scaleBiasT.c() == inT.c());
@@ -368,9 +394,23 @@ class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
assert(saveMean.GetNumElements() >= runMean.GetNumElements());
assert(saveInvStdDev.GetNumElements() >= runInvStdDev.GetNumElements());

cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
if (m_bnImpl == BatchNormImpl::CuDnn)
{
cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
}
else
{
// Only the per-activation (non-spatial) path has a CNTK implementation so far.
if (spatial)
assert(false);
else
{
CUDA_CALL(BatchNormalizationForwardTraining(crowIn, inT.n(), ptr(in), ptr(out), ptr(scale), ptr(bias),
CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev), m_stream));
}
}
}
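Both branches are expected to compute the same quantities. In per-activation mode, each of the crowIn = w·h·c activations i gets its own statistics over the n = inT.n() samples of the minibatch:

    \mu_i = \frac{1}{n} \sum_{j=1}^{n} x_{ij}, \qquad
    \sigma_i = \sqrt{\frac{1}{n} \sum_{j=1}^{n} (x_{ij} - \mu_i)^2 + \epsilon}, \qquad
    y_{ij} = \gamma_i \frac{x_{ij} - \mu_i}{\sigma_i} + \beta_i

where γ/β are scale/bias, ε is CUDNN_BN_MIN_EPSILON here, saveMean holds μ_i, and saveInvStdDev holds 1/σ_i. cuDNN documents the running-statistics update as runMean ← expAvgFactor·μ + (1 − expAvgFactor)·runMean.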

void NormalizeBatchInference(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
@@ -511,7 +551,9 @@ class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>

// REVIEW alexeyk: currently limit is set once in ctor though in CNTK it can be, theoretically, changed in runtime.
size_t m_maxTempMemSizeInSamples;
BatchNormImpl m_bnImpl;
cudnnHandle_t m_cudnn;
cudaStream_t m_stream;
// Current mini-batch size, needed for re-computing statistics in auto-tuner.
size_t m_curMBSize;
cudnnConvolutionFwdAlgoPerf_t m_fwdAlgo;
@@ -582,6 +624,7 @@ typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
// REVIEW alexeyk: assert fires in GCC but not in VC++.
// static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
RuntimeError("Not implemented.");
}
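The pattern above — an unspecialized template that fails at runtime, with float/double specializations doing the real work — is used because a static_assert with a non-dependent false condition is diagnosed eagerly by GCC even when the template is never instantiated. The same idiom in miniature (illustrative only, not code from this commit):

    #include <memory>
    #include <stdexcept>

    template <class T>
    std::unique_ptr<T> Create()
    {
        // static_assert(false, "...") here would fire in GCC at definition time,
        // so unsupported element types fail at runtime instead.
        throw std::runtime_error("Not implemented.");
    }

    template <>
    std::unique_ptr<float> Create<float>() // supported type does the real work
    {
        return std::make_unique<float>(0.0f);
    }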
template <>
typename CuDnnConvolutionEngineFactory<float>::Tensor4DPtr CuDnnConvolutionEngineFactory<float>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
@@ -599,6 +642,7 @@ typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
// REVIEW alexeyk: assert fires in GCC but not in VC++.
// static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
RuntimeError("Not implemented.");
}
template <>
typename CuDnnConvolutionEngineFactory<float>::FilterPtr CuDnnConvolutionEngineFactory<float>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
@@ -629,9 +673,9 @@ typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEn

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(
DEVICEID_TYPE /*deviceId*/, size_t maxTempMemSizeInSamples)
DEVICEID_TYPE /*deviceId*/, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
{
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(maxTempMemSizeInSamples);
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(maxTempMemSizeInSamples, bnImpl);
}

template <class ElemType>
@@ -670,7 +714,7 @@ typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEn
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, size_t)
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, size_t, BatchNormImpl)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}