Integrate eldak/eldak/deterministicFlag2 into master
Project Philly committed Sep 1, 2016
2 parents 97e305f + 901c109 commit 1504d81
Showing 17 changed files with 372 additions and 587 deletions.
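This commit wires a single switch, forceDeterministicAlgorithms, from the top-level configuration down to cuDNN engine selection, so runs can trade speed for reproducibility. A minimal sketch of how a user would enable it — only the key name is taken from the CNTK.cpp changes below; the surrounding config layout is assumed:

    # Hypothetical top-level CNTK config; the flag name comes from this commit.
    forceDeterministicAlgorithms = true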
1 change: 1 addition & 0 deletions Makefile
@@ -265,6 +265,7 @@ READER_SRC =\
COMMON_SRC =\
$(SOURCEDIR)/Common/Config.cpp \
$(SOURCEDIR)/Common/Globals.cpp \
$(SOURCEDIR)/Common/DataReader.cpp \
$(SOURCEDIR)/Common/DataWriter.cpp \
$(SOURCEDIR)/Common/ExceptionWithCallStack.cpp \
7 changes: 7 additions & 0 deletions Source/CNTK/CNTK.cpp
@@ -13,6 +13,7 @@
#endif

#include "Basics.h"
#include "Globals.h"
#include "Actions.h"
#include "ComputationNetwork.h"
#include "ComputationNode.h"
@@ -480,6 +481,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
let& config = valp.AsRef<ScriptableObjects::IConfigRecord>(); // this is the dictionary

if (config(L"forceDeterministicAlgorithms", false))
Globals::ForceDeterministicAlgorithms();

#ifndef CPUONLY
auto valpp = config.Find(L"deviceId");
if (valpp)
@@ -613,6 +617,9 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
ProgressTracing::SetTimestampingFlag();
}

if (config(L"forceDeterministicAlgorithms", false))
Globals::ForceDeterministicAlgorithms();

// get the command param set they want
wstring logpath = config(L"stderr", L"");

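Both entry points (the BrainScript path in wmainWithBS and the legacy path in wmainOldCNTKConfig) read the switch the same way: the config record's call operator takes a key and a default, so determinism stays off unless explicitly requested. A self-contained sketch of that lookup-with-default pattern, using a plain map as a stand-in for IConfigRecord:

    #include <map>
    #include <string>

    // Stand-in for IConfigRecord's operator()(key, default); illustrative only.
    static bool GetFlag(const std::map<std::wstring, bool>& config,
                        const std::wstring& key, bool defaultValue)
    {
        auto it = config.find(key);
        return it != config.end() ? it->second : defaultValue;
    }

    // Usage mirroring the diff above:
    //   if (GetFlag(config, L"forceDeterministicAlgorithms", false))
    //       Globals::ForceDeterministicAlgorithms();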
1 change: 1 addition & 0 deletions Source/CNTK/CNTK.vcxproj
@@ -149,6 +149,7 @@
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
<ClInclude Include="..\Common\Include\ExceptionWithCallStack.h" />
<ClInclude Include="..\Common\Include\Globals.h" />
<ClInclude Include="..\Common\Include\StringUtil.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\DataWriter.h" />
3 changes: 3 additions & 0 deletions Source/CNTK/CNTK.vcxproj.filters
@@ -174,6 +174,9 @@
</ClInclude>
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Readers\CompositeDataReader\CompositeDataReader.h" />
<ClInclude Include="..\Common\Include\Globals.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Text Include="modelEditor.txt">
3 changes: 2 additions & 1 deletion Source/Common/Common.vcxproj
@@ -65,9 +65,10 @@
<ClCompile Include="ExceptionWithCallStack.cpp" />
<ClCompile Include="File.cpp" />
<ClCompile Include="fileutil.cpp" />
<ClCompile Include="Globals.cpp" />
<ClCompile Include="MPIWrapper.cpp" />
<ClCompile Include="TimerUtility.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>
14 changes: 14 additions & 0 deletions Source/Common/Globals.cpp
@@ -0,0 +1,14 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "Globals.h"

using namespace std;

namespace Microsoft { namespace MSR { namespace CNTK {

std::atomic<bool> Globals::m_forceDeterministicAlgorithms(false);

}}}
29 changes: 29 additions & 0 deletions Source/Common/Include/Globals.h
@@ -0,0 +1,29 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include <atomic>

namespace Microsoft { namespace MSR { namespace CNTK {

// Class containing global configuration for CNTK.
class Globals
{
public:
static void ForceDeterministicAlgorithms()
{
m_forceDeterministicAlgorithms = true;
}

static bool ShouldForceDeterministicAlgorithms()
{
return m_forceDeterministicAlgorithms;
}

private:
static std::atomic<bool> m_forceDeterministicAlgorithms;
};
}}}
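Globals stores the flag in a std::atomic<bool>: it is set once while the configuration is parsed and then read lock-free from whatever threads later build convolution engines. A standalone sketch of that write-once/read-many pattern (names here are illustrative, not from the commit):

    #include <atomic>
    #include <cstdio>
    #include <thread>

    // Mirrors Globals::m_forceDeterministicAlgorithms.
    static std::atomic<bool> g_forceDeterministic(false);

    int main()
    {
        g_forceDeterministic = true; // set once during config parsing
        std::thread worker([] {
            if (g_forceDeterministic.load()) // lock-free read on a compute thread
                std::printf("restricting cuDNN to deterministic algorithms\n");
        });
        worker.join();
        return 0;
    }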
3 changes: 2 additions & 1 deletion Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -5,6 +5,7 @@
#pragma once

#include "Basics.h"
#include "Globals.h"
#include "Matrix.h"
#include "ComputationNode.h"
#include "ConvolutionEngine.h"
@@ -452,7 +453,7 @@ class ConvolutionNode : public ConvolutionNodeBase<ElemType>, public NumInputs<2
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind,
ConvolutionEngineKind::All, NodeName());
ConvolutionEngineKind::All, NodeName(), Globals::ShouldForceDeterministicAlgorithms());
}

if (Input(0)->GetSampleLayout().GetNumElements() != m_kernelShape.GetNumElements() * m_convEng->Geometry()->KernelCount())
4 changes: 2 additions & 2 deletions Source/Math/ConvolutionEngine.cpp
@@ -850,7 +850,7 @@ class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines, std::wstring logPrefix)
ConvolutionEngineKind enabledEngines, std::wstring logPrefix, bool forceDeterministicAlgorithms)
{
if (!logPrefix.empty())
logPrefix += L": ";
@@ -875,7 +875,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId, geometry, poolKind))
{
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, forceDeterministicAlgorithms);
}

if (isEnabled(ConvolutionEngineKind::Gemm) && GemmConvolutionEngine<ElemType>::IsSupported(deviceId, geometry))
6 changes: 3 additions & 3 deletions Source/Math/ConvolutionEngine.h
@@ -59,10 +59,10 @@ class MATH_API ConvolutionEngine

std::shared_ptr<const ConvolveGeometry> Geometry() const { return m_geometry; }

static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None,
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None,
ConvolutionEngineKind enabledEngines = ConvolutionEngineKind::All,
std::wstring logPrefix = L"");
std::wstring logPrefix = L"", bool forceDeterministicAlgorithms = false);

DISABLE_COPY_AND_MOVE(ConvolutionEngine);

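The new parameter is appended with a default of false, so every pre-existing Create call keeps compiling unchanged; only ConvolutionalNodes.h (above) passes the global flag explicitly. A toy, self-contained sketch of why the defaulted trailing parameter keeps the API source-compatible:

    // Toy signature mirroring Create's new trailing default parameter.
    static int CreateEngine(int deviceId, bool forceDeterministicAlgorithms = false)
    {
        return forceDeterministicAlgorithms ? 1 : 0; // pick the engine variant
    }

    int main()
    {
        CreateEngine(0);       // pre-existing call sites still compile
        CreateEngine(0, true); // new call site passes the flag explicitly
        return 0;
    }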
79 changes: 60 additions & 19 deletions Source/Math/CuDnnConvolutionEngine.cu
@@ -117,7 +117,7 @@ private:
class CuDnnPool
{
public:
CuDnnPool(const ConvolveGeometry& geometry, PoolKind kind)
CuDnnPool(const ConvolveGeometry& geometry, PoolKind kind, bool forceDeterministicAlgorithms)
: m_pool(nullptr)
{
assert(kind == PoolKind::Max || kind == PoolKind::Average);
Expand All @@ -139,7 +139,7 @@ public:

// Must use CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING to get the same results as in reference engine.
CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
kind == PoolKind::Max && !forceDeterministicAlgorithms ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
CUDNN_PROPAGATE_NAN,
(int)dims.size(), dims.data(), pad.data(), stride.data()));
}
@@ -173,12 +173,13 @@

public:
CuDnnConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind)
size_t maxTempMemSizeInSamples, PoolKind poolKind, bool forceDeterministicAlgorithms)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
m_cudnn(CuDnn::Instance()),
m_dataType(CuDnnTensor::GetDataType<ElemType>()),
m_inT(geometry->InputShape(), m_dataType),
m_outT(geometry->OutputShape(), m_dataType)
m_outT(geometry->OutputShape(), m_dataType),
m_forceDeterministicAlgorithms(forceDeterministicAlgorithms)
{
}

@@ -212,7 +213,17 @@ protected:
// Find best algo and allocate temp buffer, if needed.
auto finder = [this](int& calgo, cudnnConvolutionFwdAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, MaxAlgoCount, &calgo, algoPerf);
auto result = cudnnFindConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, MaxAlgoCount, &calgo, algoPerf);
if (m_forceDeterministicAlgorithms)
{
auto found = std::find_if(algoPerf, algoPerf + calgo,
[](const cudnnConvolutionFwdAlgoPerf_t& a) { return a.algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM && a.status == CUDNN_STATUS_SUCCESS; });
if (found == algoPerf + calgo)
RuntimeError("cuDNN could not find a deterministic algorithm. Set 'forceDeterministicAlgorithms=false' in your configuration.");
calgo = 1;
algoPerf[0] = *found;
}
return result;
};
auto staticFinder = [this](cudnnConvolutionFwdAlgo_t& algo) -> cudnnStatus_t
{
@@ -228,6 +239,8 @@ protected:
// REVIEW alexeyk: NVIDIA is currently reviewing this issue.
if (CUDNN_STATUS_INVALID_VALUE == err && m_fwdAlgo.Algo.memory > 0)
{
if (m_forceDeterministicAlgorithms)
RuntimeError("Falling back of the algorithms is not allowed. Please set 'forceDeterministicAlgorithms=false'.");
auto err2 = cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv,
m_fwdAlgo.NoWorkspaceAlgo, nullptr, 0, &C::Zero, m_outT, ptr(out));
// Update original error in case of success.
@@ -243,7 +256,17 @@ protected:
// Find best algo and allocate temp buffer, if needed.
auto finder = [this](int& calgo, cudnnConvolutionBwdDataAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, MaxAlgoCount, &calgo, algoPerf);
auto result = cudnnFindConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, MaxAlgoCount, &calgo, algoPerf);
if (m_forceDeterministicAlgorithms)
{
auto found = std::find_if(algoPerf, algoPerf + calgo,
[](const cudnnConvolutionBwdDataAlgoPerf_t& a) { return a.algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 && a.status == CUDNN_STATUS_SUCCESS; });
if (found == algoPerf + calgo)
RuntimeError("cuDNN could not find a deterministic algorithm. Set 'forceDeterministicAlgorithms=false' in your configuration.");
calgo = 1;
algoPerf[0] = *found;
}
return result;
};
auto staticFinder = [this](cudnnConvolutionBwdDataAlgo_t& algo) -> cudnnStatus_t
{
@@ -263,7 +286,17 @@ protected:
// Find best algo and allocate temp buffer, if needed.
auto finder = [this](int& calgo, cudnnConvolutionBwdFilterAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, MaxAlgoCount, &calgo, algoPerf);
auto result = cudnnFindConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, MaxAlgoCount, &calgo, algoPerf);
if (m_forceDeterministicAlgorithms)
{
auto found = std::find_if(algoPerf, algoPerf + calgo,
[](const cudnnConvolutionBwdFilterAlgoPerf_t& a) { return a.algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 && a.status == CUDNN_STATUS_SUCCESS; });
if (found == algoPerf + calgo)
RuntimeError("cuDNN could not find a deterministic algorithm. Set 'forceDeterministicAlgorithms=false' in your configuration.");
calgo = 1;
algoPerf[0] = *found;
}
return result;
};
auto staticFinder = [this](cudnnConvolutionBwdFilterAlgo_t& algo) -> cudnnStatus_t
{
@@ -280,7 +313,7 @@ protected:
void EnsurePoolingInitialized() override
{
if (m_pool == nullptr)
m_pool = std::make_unique<CuDnnPool>(*m_geometry, m_poolKind);
m_pool = std::make_unique<CuDnnPool>(*m_geometry, m_poolKind, m_forceDeterministicAlgorithms);
}

void ForwardPoolingCore(const Mat& in, Mat& out) override
@@ -329,10 +362,14 @@ private:
cudnnStatus_t err = finder(calgo, algoPerf);
// Alloc failed - usually means cuDNN runtime auto-tuner could not allocate workspace.
// In such case, use static auto-tuner with no workspace.
// This should never happen in the deterministic mode because we pick up algorithms with 0 memory workspace.
if (err == CUDNN_STATUS_ALLOC_FAILED)
{
decltype(CuDnnAlgoT::algo) noMemAlgo;
CUDNN_CALL(staticFinder(noMemAlgo));
if (m_forceDeterministicAlgorithms)
RuntimeError("cuDNN could not find a deterministic algorithm. Set 'forceDeterministicAlgorithms=false' in your configuration.");

algo.MaxAllowedMBSizeForCurrentAlgo = batchSize;
algo.Algo = algoPerf[0];
algo.Algo.algo = noMemAlgo;
@@ -347,20 +384,20 @@ private:
size_t maxMem = m_maxTempMemSizeInSamples == 0 ? (std::numeric_limits<size_t>::max)() : inputSampleSize * m_maxTempMemSizeInSamples * sizeof(ElemType);
// Find best (fastest) algorithm which satisfies workspace requirements.
auto res = std::find_if(algoPerf, algoPerf + calgo,
[=](const CuDnnAlgoT& cur)
{
return cur.status == CUDNN_STATUS_SUCCESS && cur.memory <= maxMem;
});
[=](const CuDnnAlgoT& cur) { return cur.status == CUDNN_STATUS_SUCCESS && cur.memory <= maxMem; });

if (res == algoPerf + calgo)
RuntimeError("cuDNN could not find suitable algorithm for the current convolution configuration.");
algo.MaxAllowedMBSizeForCurrentAlgo = batchSize;
algo.Algo = *res;

if (m_forceDeterministicAlgorithms) // deterministic mode does not allow a fallback algorithm.
return;

// Find fastest algorithm that does NOT require workspace. It is used as a fallback algo in Forward function.
res = std::find_if(algoPerf, algoPerf + calgo,
[](const CuDnnAlgoT& cur)
{
return cur.status == CUDNN_STATUS_SUCCESS && cur.memory == 0;
});
// Currently all Forward algorithms are deterministic, so no need for checking.
res = std::find_if(algoPerf, algoPerf + calgo,
[](const CuDnnAlgoT& cur) { return cur.status == CUDNN_STATUS_SUCCESS && cur.memory == 0; });
if (res == algoPerf + calgo)
{
// In theory, this should never happen.
@@ -423,14 +460,18 @@ private:
ConvAlgoInfo<cudnnConvolutionFwdAlgoPerf_t> m_fwdAlgo;
ConvAlgoInfo<cudnnConvolutionBwdDataAlgoPerf_t> m_backDataAlgo;
ConvAlgoInfo<cudnnConvolutionBwdFilterAlgoPerf_t> m_backFiltAlgo;

// Flag indicating whether only deterministic algorithms should be used.
bool m_forceDeterministicAlgorithms;
};

template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr geometry,
DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind)
size_t maxTempMemSizeInSamples, PoolKind poolKind,
bool forceDeterministicAlgorithms)
{
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, forceDeterministicAlgorithms);
}

template <class ElemType>
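The three finder lambdas above repeat one pattern: run cuDNN's auto-tuner, then, in deterministic mode, discard everything except the one algorithm known to be deterministic (IMPLICIT_GEMM for forward, ALGO_1 for backward data and backward filter) and fail loudly if the tuner did not return it. A standalone sketch of that filtering step, with a stand-in struct instead of the cudnn*AlgoPerf_t types:

    #include <algorithm>
    #include <cstddef>
    #include <stdexcept>

    // Stand-ins for the cuDNN perf-result types used in the diff above.
    enum class Algo { ImplicitGemm, Fft, Winograd };
    struct AlgoPerf { Algo algo; bool ok; float time; std::size_t memory; };

    // Keep only the known-deterministic algorithm; returns the new entry count (1).
    int KeepDeterministic(AlgoPerf* algoPerf, int calgo)
    {
        auto found = std::find_if(algoPerf, algoPerf + calgo,
            [](const AlgoPerf& a) { return a.algo == Algo::ImplicitGemm && a.ok; });
        if (found == algoPerf + calgo)
            throw std::runtime_error("no deterministic algorithm available; "
                                     "set 'forceDeterministicAlgorithms=false'");
        algoPerf[0] = *found; // pin the single deterministic choice
        return 1;
    }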
2 changes: 1 addition & 1 deletion Source/Math/CuDnnFactories.h
@@ -16,7 +16,7 @@ class CuDnnConvolutionEngineFactory
public:
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
PoolKind poolKind);
PoolKind poolKind, bool forceDeterministicAlgorithms);
static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind);
};

2 changes: 1 addition & 1 deletion Source/Math/NoGPU.cpp
@@ -2255,7 +2255,7 @@ void* GPUMatrix<ElemType>::s_curandGenerator = NULL;

template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr, DEVICEID_TYPE,
ImageLayoutKind, size_t, PoolKind)
ImageLayoutKind, size_t, PoolKind, bool)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}