Integrate eldak/eldak/deterministicFlag2 into master
Project Philly committed Sep 1, 2016
2 parents 97e305f + 901c109 commit 1504d81
Showing 17 changed files with 372 additions and 587 deletions.
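This commit wires a single switch, forceDeterministicAlgorithms, from the top-level configuration down to cuDNN engine selection, so runs can trade speed for reproducibility. A minimal sketch of how a user would enable it — only the key name is taken from the CNTK.cpp changes below; the surrounding config layout is assumed:

    # Hypothetical top-level CNTK config; the flag name comes from this commit.
    forceDeterministicAlgorithms = true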
1 change: 1 addition & 0 deletions Makefile
@@ -265,6 +265,7 @@ READER_SRC =\
COMMON_SRC =\
$(SOURCEDIR)/Common/Config.cpp \
$(SOURCEDIR)/Common/Globals.cpp \
$(SOURCEDIR)/Common/DataReader.cpp \
$(SOURCEDIR)/Common/DataWriter.cpp \
$(SOURCEDIR)/Common/ExceptionWithCallStack.cpp \
7 changes: 7 additions & 0 deletions Source/CNTK/CNTK.cpp
@@ -13,6 +13,7 @@
#endif

#include "Basics.h"
#include "Globals.h"
#include "Actions.h"
#include "ComputationNetwork.h"
#include "ComputationNode.h"
@@ -480,6 +481,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
let& config = valp.AsRef<ScriptableObjects::IConfigRecord>(); // this is the dictionary

if (config(L"forceDeterministicAlgorithms", false))
Globals::ForceDeterministicAlgorithms();

#ifndef CPUONLY
auto valpp = config.Find(L"deviceId");
if (valpp)
@@ -613,6 +617,9 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
ProgressTracing::SetTimestampingFlag();
}

if (config(L"forceDeterministicAlgorithms", false))
Globals::ForceDeterministicAlgorithms();

// get the command param set they want
wstring logpath = config(L"stderr", L"");

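Both entry points (the BrainScript path in wmainWithBS and the legacy path in wmainOldCNTKConfig) read the switch the same way: the config record's call operator takes a key and a default, so determinism stays off unless explicitly requested. A self-contained sketch of that lookup-with-default pattern, using a plain map as a stand-in for IConfigRecord:

    #include <map>
    #include <string>

    // Stand-in for IConfigRecord's operator()(key, default); illustrative only.
    static bool GetFlag(const std::map<std::wstring, bool>& config,
                        const std::wstring& key, bool defaultValue)
    {
        auto it = config.find(key);
        return it != config.end() ? it->second : defaultValue;
    }

    // Usage mirroring the diff above:
    //   if (GetFlag(config, L"forceDeterministicAlgorithms", false))
    //       Globals::ForceDeterministicAlgorithms();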
1 change: 1 addition & 0 deletions Source/CNTK/CNTK.vcxproj
@@ -149,6 +149,7 @@
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
<ClInclude Include="..\Common\Include\ExceptionWithCallStack.h" />
<ClInclude Include="..\Common\Include\Globals.h" />
<ClInclude Include="..\Common\Include\StringUtil.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\DataWriter.h" />
3 changes: 3 additions & 0 deletions Source/CNTK/CNTK.vcxproj.filters
@@ -174,6 +174,9 @@
</ClInclude>
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Readers\CompositeDataReader\CompositeDataReader.h" />
<ClInclude Include="..\Common\Include\Globals.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Text Include="modelEditor.txt">
3 changes: 2 additions & 1 deletion Source/Common/Common.vcxproj
@@ -65,9 +65,10 @@
<ClCompile Include="ExceptionWithCallStack.cpp" />
<ClCompile Include="File.cpp" />
<ClCompile Include="fileutil.cpp" />
<ClCompile Include="Globals.cpp" />
<ClCompile Include="MPIWrapper.cpp" />
<ClCompile Include="TimerUtility.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>
14 changes: 14 additions & 0 deletions Source/Common/Globals.cpp
@@ -0,0 +1,14 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "Globals.h"

using namespace std;

namespace Microsoft { namespace MSR { namespace CNTK {

std::atomic<bool> Globals::m_forceDeterministicAlgorithms(false);

}}}
29 changes: 29 additions & 0 deletions Source/Common/Include/Globals.h
@@ -0,0 +1,29 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include <atomic>

namespace Microsoft { namespace MSR { namespace CNTK {

// Class containing global configuration for CNTK.
class Globals
{
public:
static void ForceDeterministicAlgorithms()
{
m_forceDeterministicAlgorithms = true;
}

static bool ShouldForceDeterministicAlgorithms()
{
return m_forceDeterministicAlgorithms;
}

private:
static std::atomic<bool> m_forceDeterministicAlgorithms;
};
}}}
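Globals stores the flag in a std::atomic<bool>: it is set once while the configuration is parsed and then read lock-free from whatever threads later build convolution engines. A standalone sketch of that write-once/read-many pattern (names here are illustrative, not from the commit):

    #include <atomic>
    #include <cstdio>
    #include <thread>

    // Mirrors Globals::m_forceDeterministicAlgorithms.
    static std::atomic<bool> g_forceDeterministic(false);

    int main()
    {
        g_forceDeterministic = true; // set once during config parsing
        std::thread worker([] {
            if (g_forceDeterministic.load()) // lock-free read on a compute thread
                std::printf("restricting cuDNN to deterministic algorithms\n");
        });
        worker.join();
        return 0;
    }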
3 changes: 2 additions & 1 deletion Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -5,6 +5,7 @@
#pragma once

#include "Basics.h"
#include "Globals.h"
#include "Matrix.h"
#include "ComputationNode.h"
#include "ConvolutionEngine.h"
@@ -452,7 +453,7 @@ class ConvolutionNode : public ConvolutionNodeBase<ElemType>, public NumInputs<2
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind,
ConvolutionEngineKind::All, NodeName());
ConvolutionEngineKind::All, NodeName(), Globals::ShouldForceDeterministicAlgorithms());
}

if (Input(0)->GetSampleLayout().GetNumElements() != m_kernelShape.GetNumElements() * m_convEng->Geometry()->KernelCount())
4 changes: 2 additions & 2 deletions Source/Math/ConvolutionEngine.cpp
@@ -850,7 +850,7 @@ class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines, std::wstring logPrefix)
ConvolutionEngineKind enabledEngines, std::wstring logPrefix, bool forceDeterministicAlgorithms)
{
if (!logPrefix.empty())
logPrefix += L": ";
@@ -875,7 +875,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId, geometry, poolKind))
{
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, forceDeterministicAlgorithms);
}

if (isEnabled(ConvolutionEngineKind::Gemm) && GemmConvolutionEngine<ElemType>::IsSupported(deviceId, geometry))
6 changes: 3 additions & 3 deletions Source/Math/ConvolutionEngine.h
@@ -59,10 +59,10 @@ class MATH_API ConvolutionEngine

std::shared_ptr<const ConvolveGeometry> Geometry() const { return m_geometry; }

static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None,
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None,
ConvolutionEngineKind enabledEngines = ConvolutionEngineKind::All,
std::wstring logPrefix = L"");
std::wstring logPrefix = L"", bool forceDeterministicAlgorithms = false);

DISABLE_COPY_AND_MOVE(ConvolutionEngine);

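The new parameter is appended with a default of false, so every pre-existing Create call keeps compiling unchanged; only ConvolutionalNodes.h (above) passes the global flag explicitly. A toy, self-contained sketch of why the defaulted trailing parameter keeps the API source-compatible:

    // Toy signature mirroring Create's new trailing default parameter.
    static int CreateEngine(int deviceId, bool forceDeterministicAlgorithms = false)
    {
        return forceDeterministicAlgorithms ? 1 : 0; // pick the engine variant
    }

    int main()
    {
        CreateEngine(0);       // pre-existing call sites still compile
        CreateEngine(0, true); // new call site passes the flag explicitly
        return 0;
    }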
79 changes: 60 additions & 19 deletions Source/Math/CuDnnConvolutionEngine.cu
@@ -117,7 +117,7 @@ private:
class CuDnnPool
{
public:
CuDnnPool(const ConvolveGeometry& geometry, PoolKind kind)
CuDnnPool(const ConvolveGeometry& geometry, PoolKind kind, bool forceDeterministicAlgorithms)
: m_pool(nullptr)
{
assert(kind == PoolKind::Max || kind == PoolKind::Average);
Expand All @@ -139,7 +139,7 @@ public:

// Must use CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING to get the same results as in reference engine.
CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
kind == PoolKind::Max && !forceDeterministicAlgorithms ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
CUDNN_PROPAGATE_NAN,
(int)dims.size(), dims.data(), pad.data(), stride.data()));
}
@@ -173,12 +173,13 @@

public:
CuDnnConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind)
size_t maxTempMemSizeInSamples, PoolKind poolKind, bool forceDeterministicAlgorithms)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
m_cudnn(CuDnn::Instance()),
m_dataType(CuDnnTensor::GetDataType<ElemType>()),
m_inT(geometry->InputShape(), m_dataType),
m_outT(geometry->OutputShape(), m_dataType)
m_outT(geometry->OutputShape(), m_dataType),
m_forceDeterministicAlgorithms(forceDeterministicAlgorithms)
{
}

@@ -212,7 +213,17 @@ protected:
// Find best algo and allocate temp buffer, if needed.
auto finder = [this](int& calgo, cudnnConvolutionFwdAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, MaxAlgoCount, &calgo, algoPerf);
auto result = cudnnFindConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, MaxAlgoCount, &calgo, algoPerf);
if (m_forceDeterministicAlgorithms)
{
auto found = std::find_if(algoPerf, algoPerf + calgo,
[](const cudnnConvolutionFwdAlgoPerf_t& a) { return a.algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM && a.status == CUDNN_STATUS_SUCCESS; });
if (found == algoPerf + calgo)
RuntimeError("cuDNN could not find a deterministic algorithm. Set 'forceDeterministicAlgorithms=false' in your configuration.");
calgo = 1;
algoPerf[0] = *found;
}
return result;
};
auto staticFinder = [this](cudnnConvolutionFwdAlgo_t& algo) -> cudnnStatus_t
{
@@ -228,6 +239,8 @@ protected:
// REVIEW alexeyk: NVIDIA is currently reviewing this issue.
if (CUDNN_STATUS_INVALID_VALUE == err && m_fwdAlgo.Algo.memory > 0)
{
if (m_forceDeterministicAlgorithms)
RuntimeError("Falling back of the algorithms is not allowed. Please set 'forceDeterministicAlgorithms=false'.");
auto err2 = cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv,
m_fwdAlgo.NoWorkspaceAlgo, nullptr, 0, &C::Zero, m_outT, ptr(out));
// Update original error in case of success.
@@ -243,7 +256,17 @@ protected:
// Find best algo and allocate temp buffer, if needed.
auto finder = [this](int& calgo, cudnnConvolutionBwdDataAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, MaxAlgoCount, &calgo, algoPerf);
auto result = cudnnFindConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, MaxAlgoCount, &calgo, algoPerf);
if (m_forceDeterministicAlgorithms)
{
auto found = std::find_if(algoPerf, algoPerf + calgo,
[](const cudnnConvolutionBwdDataAlgoPerf_t& a) { return a.algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 && a.status == CUDNN_STATUS_SUCCESS; });
if (found == algoPerf + calgo)
RuntimeError("cuDNN could not find a deterministic algorithm. Set 'forceDeterministicAlgorithms=false' in your configuration.");
calgo = 1;
algoPerf[0] = *found;
}
return result;
};
auto staticFinder = [this](cudnnConvolutionBwdDataAlgo_t& algo) -> cudnnStatus_t
{
@@ -263,7 +286,17 @@ protected:
// Find best algo and allocate temp buffer, if needed.
auto finder = [this](int& calgo, cudnnConvolutionBwdFilterAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, MaxAlgoCount, &calgo, algoPerf);
auto result = cudnnFindConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, MaxAlgoCount, &calgo, algoPerf);
if (m_forceDeterministicAlgorithms)
{
auto found = std::find_if(algoPerf, algoPerf + calgo,
[](const cudnnConvolutionBwdFilterAlgoPerf_t& a) { return a.algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 && a.status == CUDNN_STATUS_SUCCESS; });
if (found == algoPerf + calgo)
RuntimeError("cuDNN could not find a deterministic algorithm. Set 'forceDeterministicAlgorithms=false' in your configuration.");
calgo = 1;
algoPerf[0] = *found;
}
return result;
};
auto staticFinder = [this](cudnnConvolutionBwdFilterAlgo_t& algo) -> cudnnStatus_t
{
@@ -280,7 +313,7 @@ protected:
void EnsurePoolingInitialized() override
{
if (m_pool == nullptr)
m_pool = std::make_unique<CuDnnPool>(*m_geometry, m_poolKind);
m_pool = std::make_unique<CuDnnPool>(*m_geometry, m_poolKind, m_forceDeterministicAlgorithms);
}

void ForwardPoolingCore(const Mat& in, Mat& out) override
@@ -329,10 +362,14 @@ private:
cudnnStatus_t err = finder(calgo, algoPerf);
// Alloc failed - usually means cuDNN runtime auto-tuner could not allocate workspace.
// In such case, use static auto-tuner with no workspace.
// This should never happen in the deterministic mode because we pick up algorithms with 0 memory workspace.
if (err == CUDNN_STATUS_ALLOC_FAILED)
{
decltype(CuDnnAlgoT::algo) noMemAlgo;
CUDNN_CALL(staticFinder(noMemAlgo));
if (m_forceDeterministicAlgorithms)
RuntimeError("cuDNN could not find a deterministic algorithm. Set 'forceDeterministicAlgorithms=false' in your configuration.");

algo.MaxAllowedMBSizeForCurrentAlgo = batchSize;
algo.Algo = algoPerf[0];
algo.Algo.algo = noMemAlgo;
@@ -347,20 +384,20 @@ private:
size_t maxMem = m_maxTempMemSizeInSamples == 0 ? (std::numeric_limits<size_t>::max)() : inputSampleSize * m_maxTempMemSizeInSamples * sizeof(ElemType);
// Find best (fastest) algorithm which satisfies workspace requirements.
auto res = std::find_if(algoPerf, algoPerf + calgo,
[=](const CuDnnAlgoT& cur)
{
return cur.status == CUDNN_STATUS_SUCCESS && cur.memory <= maxMem;
});
[=](const CuDnnAlgoT& cur) { return cur.status == CUDNN_STATUS_SUCCESS && cur.memory <= maxMem; });

if (res == algoPerf + calgo)
RuntimeError("cuDNN could not find suitable algorithm for the current convolution configuration.");
algo.MaxAllowedMBSizeForCurrentAlgo = batchSize;
algo.Algo = *res;

if (m_forceDeterministicAlgorithms) // deterministic mode does not allow a fallback algorithm.
return;

// Find fastest algorithm that does NOT require workspace. It is used as a fallback algo in Forward function.
res = std::find_if(algoPerf, algoPerf + calgo,
[](const CuDnnAlgoT& cur)
{
return cur.status == CUDNN_STATUS_SUCCESS && cur.memory == 0;
});
// Currently all Forward algorithms are deterministic, so no need for checking.
res = std::find_if(algoPerf, algoPerf + calgo,
[](const CuDnnAlgoT& cur) { return cur.status == CUDNN_STATUS_SUCCESS && cur.memory == 0; });
if (res == algoPerf + calgo)
{
// In theory, this should never happen.
@@ -423,14 +460,18 @@ private:
ConvAlgoInfo<cudnnConvolutionFwdAlgoPerf_t> m_fwdAlgo;
ConvAlgoInfo<cudnnConvolutionBwdDataAlgoPerf_t> m_backDataAlgo;
ConvAlgoInfo<cudnnConvolutionBwdFilterAlgoPerf_t> m_backFiltAlgo;

// Flag indicating whether only deterministic algorithms should be used.
bool m_forceDeterministicAlgorithms;
};

template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr geometry,
DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind)
size_t maxTempMemSizeInSamples, PoolKind poolKind,
bool forceDeterministicAlgorithms)
{
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, forceDeterministicAlgorithms);
}

template <class ElemType>
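The three finder lambdas above repeat one pattern: run cuDNN's auto-tuner, then, in deterministic mode, discard everything except the one algorithm known to be deterministic (IMPLICIT_GEMM for forward, ALGO_1 for backward data and backward filter) and fail loudly if the tuner did not return it. A standalone sketch of that filtering step, with a stand-in struct instead of the cudnn*AlgoPerf_t types:

    #include <algorithm>
    #include <cstddef>
    #include <stdexcept>

    // Stand-ins for the cuDNN perf-result types used in the diff above.
    enum class Algo { ImplicitGemm, Fft, Winograd };
    struct AlgoPerf { Algo algo; bool ok; float time; std::size_t memory; };

    // Keep only the known-deterministic algorithm; returns the new entry count (1).
    int KeepDeterministic(AlgoPerf* algoPerf, int calgo)
    {
        auto found = std::find_if(algoPerf, algoPerf + calgo,
            [](const AlgoPerf& a) { return a.algo == Algo::ImplicitGemm && a.ok; });
        if (found == algoPerf + calgo)
            throw std::runtime_error("no deterministic algorithm available; "
                                     "set 'forceDeterministicAlgorithms=false'");
        algoPerf[0] = *found; // pin the single deterministic choice
        return 1;
    }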
2 changes: 1 addition & 1 deletion Source/Math/CuDnnFactories.h
@@ -16,7 +16,7 @@ class CuDnnConvolutionEngineFactory
public:
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
PoolKind poolKind);
PoolKind poolKind, bool forceDeterministicAlgorithms);
static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind);
};

2 changes: 1 addition & 1 deletion Source/Math/NoGPU.cpp
@@ -2255,7 +2255,7 @@ void* GPUMatrix<ElemType>::s_curandGenerator = NULL;

template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr, DEVICEID_TYPE,
ImageLayoutKind, size_t, PoolKind)
ImageLayoutKind, size_t, PoolKind, bool)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}