Added batch norm per-activation feed forward implementation.
Alexey Kamenev committed Feb 12, 2016
1 parent 6465c80 commit 4e9483b
Showing 9 changed files with 602 additions and 23 deletions.
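The per-activation batch-norm forward kernel itself is added in CuDnnConvolutionEngine.cuh (one of the nine files, not shown in this diff view). As a sketch only — assuming the column-major, one-column-per-sample layout implied by the crowIn and inT.n() arguments passed to the kernel launcher below — a CPU reference of what a per-activation forward training pass computes:

    // Reference (CPU) sketch of per-activation batch norm forward training.
    // Hypothetical helper, not part of this commit; the real GPU kernel lives
    // in CuDnnConvolutionEngine.cuh. Assumes column-major storage: crow values
    // per sample, n samples, element (i, j) at in[j * crow + i].
    #include <cmath>
    #include <cstddef>

    template <class ElemType>
    void BatchNormForwardTrainingRef(size_t crow, size_t n, const ElemType* in, ElemType* out,
                                     const ElemType* scale, const ElemType* bias, double epsilon,
                                     ElemType* saveMean, ElemType* saveInvStdDev)
    {
        for (size_t i = 0; i < crow; i++) // each activation gets its own statistics
        {
            double mean = 0;
            for (size_t j = 0; j < n; j++)
                mean += in[j * crow + i];
            mean /= n;
            double var = 0;
            for (size_t j = 0; j < n; j++)
            {
                double d = in[j * crow + i] - mean;
                var += d * d;
            }
            var /= n; // biased variance, as in batch normalization
            double invStdDev = 1.0 / std::sqrt(var + epsilon);
            saveMean[i] = (ElemType)mean;
            saveInvStdDev[i] = (ElemType)invStdDev;
            for (size_t j = 0; j < n; j++)
                out[j * crow + i] = (ElemType)(scale[i] * ((in[j * crow + i] - mean) * invStdDev) + bias[i]);
        }
    }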
2 changes: 1 addition & 1 deletion Source/Math/ConvolutionEngine.cpp
@@ -448,7 +448,7 @@ class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
}

ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples) override
ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples, BatchNormImpl /*bnImpl*/) override
{
return std::make_unique<DefaultConvolutionEngine<ElemType>>(deviceId, maxTempMemSizeInSamples);
}
9 changes: 8 additions & 1 deletion Source/Math/ConvolutionEngine.h
@@ -289,6 +289,13 @@ class MATH_API PoolingEngine
PoolingEngine& operator=(PoolingEngine&&) = delete;
};

// REVIEW alexeyk: this is a temporary hack until we find a better place for poor BatchNorm.
enum class BatchNormImpl
{
CuDnn,
Cntk
};

template <class ElemType>
class MATH_API ConvolutionEngineFactory
{
@@ -316,7 +323,7 @@ class MATH_API ConvolutionEngineFactory
virtual PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) = 0;
// virtual Tensor4DPtr CreateLrnDescriptor() = 0;

virtual ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples) = 0;
virtual ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl = BatchNormImpl::CuDnn) = 0;
virtual PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId) = 0;

enum class EngineType
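Since the new parameter defaults to BatchNormImpl::CuDnn, existing call sites keep the cuDNN implementation and a caller opts into the CNTK kernels explicitly. A hypothetical call site (factory, deviceId, and maxTempMemSizeInSamples are assumed names, not from this diff):

    // Default: cuDNN batch normalization.
    auto cudnnEng = factory->CreateConvEngine(deviceId, maxTempMemSizeInSamples);
    // Opt in to the new CNTK per-activation kernels.
    auto cntkEng = factory->CreateConvEngine(deviceId, maxTempMemSizeInSamples, BatchNormImpl::Cntk);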
Source/Math/CuDnnConvolutionEngine.cpp
@@ -8,6 +8,7 @@
#include "GPUMatrix.h"
#ifdef USE_CUDNN
#include <cudnn.h>
#include "CuDnnConvolutionEngine.cuh"

template <>
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
@@ -37,6 +38,32 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId)
#endif
}

// Recreates the CUDA events (destroying any left over from a previous run) and
// records the start event on the current stream, so a single timer can be reused.
void CudaTimer::Start()
{
    cudaEvent_t start;
    cudaEvent_t stop;
    if (m_start != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
    if (m_stop != nullptr)
        CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));
    m_start = start;
    m_stop = stop;
    CUDA_CALL(cudaEventRecord(start, GetStream()));
}

// Records the stop event and blocks the host until it completes.
void CudaTimer::Stop()
{
    CUDA_CALL(cudaEventRecord(reinterpret_cast<cudaEvent_t>(m_stop), GetStream()));
    CUDA_CALL(cudaEventSynchronize(reinterpret_cast<cudaEvent_t>(m_stop)));
}

// Returns the time between the start and stop events, in milliseconds.
float CudaTimer::Elapsed()
{
    float ms;
    CUDA_CALL(cudaEventElapsedTime(&ms, reinterpret_cast<cudaEvent_t>(m_start), reinterpret_cast<cudaEvent_t>(m_stop)));
    return ms;
}
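A minimal usage sketch for the timer (an assumed pattern, not code from this commit); Start may be called repeatedly because it recreates both events each time:

    CudaTimer timer;
    timer.Start();              // records the start event on the current stream
    LaunchSomeKernel();         // hypothetical work submitted to the same stream
    timer.Stop();               // records the stop event and waits for it to complete
    float ms = timer.Elapsed(); // GPU-side elapsed time in milliseconds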

#ifdef USE_CUDNN

class CuDnnTensor4D : public ConvolutionTensor4D
@@ -56,7 +83,7 @@ class CuDnnTensor4D : public ConvolutionTensor4D
return m_tensor;
}

~CuDnnTensor4D()
~CuDnnTensor4D() noexcept
{
if (m_tensor != nullptr)
{
@@ -94,7 +121,7 @@ class CuDnnFilter : public ConvolutionFilter
return m_filter;
}

~CuDnnFilter()
~CuDnnFilter() noexcept
{
if (m_filter != nullptr)
{
@@ -126,7 +153,7 @@ class CuDnnConvolutionDescriptor : public ConvolutionDescriptor
return m_conv;
}

~CuDnnConvolutionDescriptor()
~CuDnnConvolutionDescriptor() noexcept
{
if (m_conv != nullptr)
{
@@ -161,7 +188,7 @@ class CuDnnPoolingDescriptor : public PoolingDescriptor
return m_pool;
}

~CuDnnPoolingDescriptor()
~CuDnnPoolingDescriptor() noexcept
{
if (m_pool != nullptr)
{
@@ -233,11 +260,11 @@ class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
using typename Base::Filter;
using typename Base::ConvDesc;

CuDnnConvolutionEngine(size_t maxTempMemSizeInSamples)
: m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_cudnn(nullptr), m_curMBSize(0)
CuDnnConvolutionEngine(size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
: m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_bnImpl(bnImpl), m_cudnn(nullptr), m_stream(GetStream()), m_curMBSize(0)
{
CUDNN_CALL(cudnnCreate(&m_cudnn));
CUDNN_CALL(cudnnSetStream(m_cudnn, GetStream()));
CUDNN_CALL(cudnnSetStream(m_cudnn, m_stream));
m_fwdAlgo.status = CUDNN_STATUS_NOT_INITIALIZED;
m_backDataAlgo.status = CUDNN_STATUS_NOT_INITIALIZED;
m_backFiltAlgo.status = CUDNN_STATUS_NOT_INITIALIZED;
@@ -341,7 +368,6 @@ class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out, Mat& saveMean, Mat& saveInvStdDev) override
{
const size_t crowIn = inT.w() * inT.h() * inT.c();
UNUSED(crowIn); // crowIn used only in asserts.
if (spatial)
{
assert(scaleBiasT.c() == inT.c());
@@ -368,9 +394,23 @@ class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
assert(saveMean.GetNumElements() >= runMean.GetNumElements());
assert(saveInvStdDev.GetNumElements() >= runInvStdDev.GetNumElements());

cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
if (m_bnImpl == BatchNormImpl::CuDnn)
{
cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
}
else
{
// Only the per-activation (non-spatial) path has a CNTK implementation so far.
if (spatial)
assert(false);
else
{
CUDA_CALL(BatchNormalizationForwardTraining(crowIn, inT.n(), ptr(in), ptr(out), ptr(scale), ptr(bias),
CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev), m_stream));
}
}
}
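Both branches are expected to compute the same quantities. In per-activation mode, each of the crowIn = w·h·c activations i gets its own statistics over the n = inT.n() samples of the minibatch:

    \mu_i = \frac{1}{n} \sum_{j=1}^{n} x_{ij}, \qquad
    \sigma_i = \sqrt{\frac{1}{n} \sum_{j=1}^{n} (x_{ij} - \mu_i)^2 + \epsilon}, \qquad
    y_{ij} = \gamma_i \frac{x_{ij} - \mu_i}{\sigma_i} + \beta_i

where γ/β are scale/bias, ε is CUDNN_BN_MIN_EPSILON here, saveMean holds μ_i, and saveInvStdDev holds 1/σ_i. cuDNN documents the running-statistics update as runMean ← expAvgFactor·μ + (1 − expAvgFactor)·runMean.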

void NormalizeBatchInference(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
@@ -511,7 +551,9 @@ class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>

// REVIEW alexeyk: currently limit is set once in ctor though in CNTK it can be, theoretically, changed in runtime.
size_t m_maxTempMemSizeInSamples;
BatchNormImpl m_bnImpl;
cudnnHandle_t m_cudnn;
cudaStream_t m_stream;
// Current mini-batch size, needed for re-computing statistics in auto-tuner.
size_t m_curMBSize;
cudnnConvolutionFwdAlgoPerf_t m_fwdAlgo;
@@ -582,6 +624,7 @@ typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
// REVIEW alexeyk: assert fires in GCC but not in VC++.
// static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
RuntimeError("Not implemented.");
}
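The pattern above — an unspecialized template that fails at runtime, with float/double specializations doing the real work — is used because a static_assert with a non-dependent false condition is diagnosed eagerly by GCC even when the template is never instantiated. The same idiom in miniature (illustrative only, not code from this commit):

    #include <memory>
    #include <stdexcept>

    template <class T>
    std::unique_ptr<T> Create()
    {
        // static_assert(false, "...") here would fire in GCC at definition time,
        // so unsupported element types fail at runtime instead.
        throw std::runtime_error("Not implemented.");
    }

    template <>
    std::unique_ptr<float> Create<float>() // supported type does the real work
    {
        return std::make_unique<float>(0.0f);
    }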
template <>
typename CuDnnConvolutionEngineFactory<float>::Tensor4DPtr CuDnnConvolutionEngineFactory<float>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
@@ -599,6 +642,7 @@ typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
// REVIEW alexeyk: assert fires in GCC but not in VC++.
// static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
RuntimeError("Not implemented.");
}
template <>
typename CuDnnConvolutionEngineFactory<float>::FilterPtr CuDnnConvolutionEngineFactory<float>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
@@ -629,9 +673,9 @@ typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEn

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(
DEVICEID_TYPE /*deviceId*/, size_t maxTempMemSizeInSamples)
DEVICEID_TYPE /*deviceId*/, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
{
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(maxTempMemSizeInSamples);
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(maxTempMemSizeInSamples, bnImpl);
}

template <class ElemType>
@@ -670,7 +714,7 @@ typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEn
}

template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, size_t)
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, size_t, BatchNormImpl)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}