Skip to content

Commit

Permalink
cudnn: enabled build on Linux with cuDNN.
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexey Kamenev committed Dec 12, 2015
1 parent 895c10a commit 3023f11
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 65 deletions.
11 changes: 11 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
# If not specified, GPU will not be enabled
# CUB_PATH= path to NVIDIA CUB installation, so $(CUB_PATH)/cub/cub.cuh exists
# defaults to /usr/local/cub-1.4.1
# CUDNN_PATH= path to NVIDIA cuDNN installation, so $(CUDNN_PATH)/cuda/include/cudnn.h exists
# If not specified, CNTK will be built without cuDNN.
# KALDI_PATH= Path to Kaldi
# If not specified, Kaldi plugins will not be built
# OPENCV_PATH= path to OpenCV 3.0.0 installation, so $(OPENCV_PATH) exists
Expand Down Expand Up @@ -102,6 +104,13 @@ ifdef CUDA_PATH
LIBPATH += $(CUDA_PATH)/lib64
LIBS += -lcublas -lcudart -lcuda -lcurand -lcusparse -lnvidia-ml
# Set up cuDNN if needed
ifdef CUDNN_PATH
INCLUDEPATH += $(CUDNN_PATH)/cuda/include
LIBPATH += $(CUDNN_PATH)/cuda/lib64
LIBS += -lcudnn
CPPFLAGS +=-DUSE_CUDNN
endif
else
DEVICE = cpu
Expand Down Expand Up @@ -218,13 +227,15 @@ MATH_SRC =\
Math/Math/QuantizedMatrix.cpp \
Math/Math/Matrix.cpp \
Math/Math/CUDAPageLockedMemAllocator.cpp \
Math/Math/ConvolutionEngine.cpp \
ifdef CUDA_PATH
MATH_SRC +=\
Math/Math/GPUMatrix.cu \
Math/Math/GPUSparseMatrix.cu \
Math/Math/GPUWatcher.cu \
Math/Math/MatrixQuantizerGPU.cu \
Math/Math/CuDnnConvolutionEngine.cpp \
else
MATH_SRC +=\
Expand Down
136 changes: 83 additions & 53 deletions Math/Math/ConvolutionEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,62 +11,15 @@
namespace Microsoft { namespace MSR { namespace CNTK {

template<class ElemType>
class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
class DefaultConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
DefaultConvolutionEngineFactory(DEVICEID_TYPE deviceId)
: ConvolutionEngineFactory<ElemType>(deviceId)
{
}

public:
Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override
{
return std::make_unique<ConvolutionTensor4D>(w, h, c, n);
}

FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override
{
return std::make_unique<Filter>(w, h, c, k);
}

ConvDescPtr CreateConvDescriptor(const Tensor4D& /*inT*/, const Filter& /*filterT*/,
size_t wStride, size_t hStride, bool padding) override
{
return std::make_unique<ConvDesc>(wStride, hStride, padding);
}

PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override
{
return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
}

ConvEnginePtr CreateConvEngine(size_t maxTempMemSizeInSamples) override
{
return std::make_unique<DefaultConvolutionEngine<ElemType>>(m_deviceId, maxTempMemSizeInSamples);
}

PoolEnginePtr CreatePoolEngine() override
{
return std::make_unique<DefaultPoolingEngine<ElemType>>();
}
};
using Base = ConvolutionEngine<ElemType>;
using typename Base::Mat;
using typename Base::Tensor4D;
using typename Base::Filter;
using typename Base::ConvDesc;

template<class ElemType>
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId)
{
// REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported())
return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>(deviceId);
return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>(deviceId);
}

template class ConvolutionEngineFactory<float>;
template class ConvolutionEngineFactory<double>;

template<class ElemType>
class DefaultConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
DefaultConvolutionEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples)
: m_tempMatrix(deviceId), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
Expand Down Expand Up @@ -294,6 +247,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class DefaultPoolingEngine : public PoolingEngine<ElemType>
{
public:
using Base = PoolingEngine<ElemType>;
using typename Base::Tensor4D;
using typename Base::PoolDesc;
using typename Base::Mat;

public:
void Forward(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) override
{
Expand Down Expand Up @@ -349,4 +308,75 @@ namespace Microsoft { namespace MSR { namespace CNTK {

template class PoolingEngine<float>;
template class PoolingEngine<double>;

// Factory for the default (non-cuDNN) convolution and pooling primitives.
// Serves as the fallback implementation when cuDNN is not available.
template<class ElemType>
class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
{
public:
    using Base = ConvolutionEngineFactory<ElemType>;
    // Re-export dependent type aliases from the base template so they can be
    // referred to without qualification inside this class.
    using typename Base::Tensor4D;
    using typename Base::Tensor4DPtr;
    using typename Base::Filter;
    using typename Base::FilterPtr;
    using typename Base::ConvDesc;
    using typename Base::ConvDescPtr;
    using typename Base::PoolDesc;
    using typename Base::PoolDescPtr;
    using typename Base::ConvEnginePtr;
    using typename Base::PoolEnginePtr;
    using Base::m_deviceId;

public:
    DefaultConvolutionEngineFactory(DEVICEID_TYPE deviceId)
        : ConvolutionEngineFactory<ElemType>(deviceId)
    {
    }

public:
    // Engine creation: both engines come from the default implementations in
    // this translation unit.
    ConvEnginePtr CreateConvEngine(size_t maxTempMemSizeInSamples) override
    {
        return std::make_unique<DefaultConvolutionEngine<ElemType>>(m_deviceId, maxTempMemSizeInSamples);
    }

    PoolEnginePtr CreatePoolEngine() override
    {
        return std::make_unique<DefaultPoolingEngine<ElemType>>();
    }

    // Descriptor creation: the default descriptors are plain data holders with
    // no device-specific state.
    Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override
    {
        return std::make_unique<ConvolutionTensor4D>(w, h, c, n);
    }

    FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override
    {
        return std::make_unique<Filter>(w, h, c, k);
    }

    // inT/filterT are unused here: the default descriptor stores only strides
    // and the padding flag.
    ConvDescPtr CreateConvDescriptor(const Tensor4D& /*inT*/, const Filter& /*filterT*/,
                                     size_t wStride, size_t hStride, bool padding) override
    {
        return std::make_unique<ConvDesc>(wStride, hStride, padding);
    }

    PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override
    {
        return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
    }
};

// Creates the convolution engine factory appropriate for the given device.
// Uses the cuDNN implementation when running on a GPU and the build supports
// cuDNN; otherwise falls back to the default engine. Returning the cuDNN
// factory unconditionally (as the commented-out variant did) would break CPU
// devices (deviceId < 0) and builds compiled without USE_CUDNN.
// REVIEW alexeyk: add config parameter to enable runtime switch between implementations.
template<class ElemType>
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId)
{
    // deviceId >= 0 denotes a GPU device; IsSupported() reports whether this
    // build actually has a usable cuDNN backend.
    if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported())
        return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>(deviceId);
    return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>(deviceId);
}

template class ConvolutionEngineFactory<float>;
template class ConvolutionEngineFactory<double>;

}}}
27 changes: 17 additions & 10 deletions Math/Math/CuDnnConvolutionEngine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#ifdef USE_CUDNN
#include <cudnn.h>

template<> static const char* CudaErrString(cudnnStatus_t x)
// Maps a cuDNN status code to its human-readable message; this specialization
// lets the shared CUDA error-reporting helpers format cudnnStatus_t failures.
template<> const char* CudaErrString(cudnnStatus_t x)
{
    return cudnnGetErrorString(x);
}
Expand Down Expand Up @@ -202,12 +202,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
using Tensor4D = ConvolutionTensor4D;
using Tensor4DPtr = std::unique_ptr<Tensor4D>;
using Filter = ConvolutionFilter;
using FilterPtr = std::unique_ptr<ConvolutionFilter>;
using ConvDesc = ConvolutionDescriptor;
using ConvDescPtr = std::unique_ptr<ConvolutionDescriptor>;
using Base = ConvolutionEngine<ElemType>;
using typename Base::Mat;
using typename Base::Tensor4D;
using typename Base::Filter;
using typename Base::ConvDesc;

CuDnnConvolutionEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples)
: m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_cudnn(nullptr), m_curMBSize(0), m_tempC(deviceId)
Expand Down Expand Up @@ -495,6 +494,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class CuDnnPoolingEngine : public PoolingEngine<ElemType>
{
public:
using Base = PoolingEngine<ElemType>;
using typename Base::Tensor4D;
using typename Base::PoolDesc;
using typename Base::Mat;

public:
CuDnnPoolingEngine()
: m_cudnn(nullptr)
Expand Down Expand Up @@ -546,7 +551,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
// REVIEW alexeyk: assert fires in GCC but not in VC++.
//static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
}
template<>
typename CuDnnConvolutionEngineFactory<float>::Tensor4DPtr CuDnnConvolutionEngineFactory<float>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
Expand All @@ -562,7 +568,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
// REVIEW alexeyk: assert fires in GCC but not in VC++.
//static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
}
template<>
typename CuDnnConvolutionEngineFactory<float>::FilterPtr CuDnnConvolutionEngineFactory<float>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
Expand All @@ -586,7 +593,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

template<class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
{
return std::make_unique<CuDnnPoolingDescriptor>(kind, w, h, wStride, hStride, wPad, hPad);
}
Expand Down
20 changes: 18 additions & 2 deletions Math/Math/CuDnnConvolutionEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
class CuDnnConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
{
public:
using Base = ConvolutionEngineFactory<ElemType>;
using typename Base::Tensor4D;
using typename Base::Tensor4DPtr;
using typename Base::Filter;
using typename Base::FilterPtr;
using typename Base::ConvDesc;
using typename Base::ConvDescPtr;
using typename Base::PoolDesc;
using typename Base::PoolDescPtr;

using typename Base::ConvEnginePtr;
using typename Base::PoolEnginePtr;

using Base::m_deviceId;

public:
CuDnnConvolutionEngineFactory(DEVICEID_TYPE deviceId)
: ConvolutionEngineFactory<ElemType>(deviceId)
Expand All @@ -22,9 +38,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override;
FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override;
ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
size_t wStride, size_t hStride, bool padding) override;
PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override;
PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override;

ConvEnginePtr CreateConvEngine(size_t maxTempMemSizeInSamples) override;
PoolEnginePtr CreatePoolEngine() override;
Expand Down
45 changes: 45 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ have_cub=no
cub_path=
cub_check=cub/cub.cuh

have_cudnn=no
cudnn_path=
cudnn_check=cuda/include/cudnn.h

have_opencv=no
opencv_path=
opencv_check=include/opencv2/opencv.hpp
Expand All @@ -49,6 +53,7 @@ default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
default_kaldis="kaldi-trunk"
default_gdks=". gdk/usr"
default_cubs="cub-1.4.1"
default_cudnns="cudnn-4.0"
default_opencvs="opencv-3.0.0"

function default_paths ()
Expand Down Expand Up @@ -115,6 +120,11 @@ function find_cub ()
find_dir "$default_cubs" "$cub_check"
}

function find_cudnn ()
{
find_dir "$default_cudnns" "$cudnn_check"
}

function find_opencv ()
{
find_dir "$default_opencvs" "$opencv_check"
Expand Down Expand Up @@ -162,6 +172,7 @@ function show_help ()
echo " --with-cuda[=directory] $(show_default $(find_cuda))"
echo " --with-cub[=directory] $(show_default $(find_cub))"
echo " --with-gdk[=directory] $(show_default $(find_gdk))"
echo " --with-cudnn[=directory] $(show_default $(find_cudnn))"
echo " --with-acml[=directory] $(show_default $(find_acml))"
echo " --with-mkl[=directory] $(show_default $(find_mkl))"
echo " --with-buildtype=(debug|release) $(show_default $default_buildtype)"
Expand Down Expand Up @@ -278,6 +289,27 @@ do
fi
fi
;;
--with-cudnn*)
have_cudnn=yes
if test x$optarg = x
then
cudnn_path=$(find_cudnn)
if test x$cudnn_path = x
then
echo "Cannot find NVIDIA cuDNN directory."
echo "Please specify a value for --with-cudnn"
exit 1
fi
else
if test $(check_dir $optarg $cudnn_check) = yes
then
cudnn_path=$optarg
else
echo "Invalid cuDNN directory $optarg"
exit 1
fi
fi
;;
--with-acml*)
have_acml=yes
mathlib=acml
Expand Down Expand Up @@ -448,6 +480,18 @@ then
echo Found CUB at $cub_path
fi
fi

if test $enable_cuda = yes && test x$cudnn_path = x
then
cudnn_path=$(find_cudnn)
if test x$cudnn_path = x ; then
echo Cannot locate NVIDIA cuDNN directory
echo CNTK will use default convolution engine.
else
echo Found cuDNN at $cudnn_path
fi
fi

config=$build_top/Config.make
echo Generating $config
echo "#Configuration file for cntk" > $config
Expand All @@ -465,6 +509,7 @@ if test $enable_cuda = yes ; then
echo CUDA_PATH=$cuda_path >> $config
echo GDK_PATH=$gdk_path >> $config
echo CUB_PATH=$cub_path >> $config
echo CUDNN_PATH=$cudnn_path >> $config
fi
if test x$kaldi_path != x ; then
echo KALDI_PATH=$kaldi_path >> $config
Expand Down

0 comments on commit 3023f11

Please sign in to comment.