Commit

Switch to CuDNN v5
For batch normalization, the running inverse standard deviation becomes
a running variance. We mirror this CuDNN v5 change in the CNTK batch
normalization engine. The model version is bumped; when old models are
loaded, the stored inverse standard deviation is (approximately)
converted to a variance.
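
For illustration, here is a rough sketch (not the actual CNTK loading
code) of the conversion idea. It assumes the old parameter stored
invStdDev = 1 / sqrt(variance + epsilon); the recovered variance is then
only approximate, since the epsilon in effect at save time may differ.

#include <vector>

// Approximate inversion: if invStdDev = 1 / sqrt(variance + epsilon),
// then variance = 1 / invStdDev^2 - epsilon.
std::vector<float> InvStdDevToVariance(const std::vector<float>& invStdDev, float epsilon)
{
    std::vector<float> variance(invStdDev.size());
    for (size_t i = 0; i < invStdDev.size(); ++i)
        variance[i] = 1.0f / (invStdDev[i] * invStdDev[i]) - epsilon;
    return variance;
}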

In the same model version change, batch normalization now counts
samples seen rather than minibatches (this fixes incorrect averaging
when the minibatch size varies across epochs).
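
As a toy illustration (not CNTK's actual update rule, which uses an
exponential average with a time constant): averaging per minibatch
over-weights small minibatches, while counting samples seen weights
every sample equally.

#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // (minibatch mean, minibatch size): e.g. one epoch runs with size 64, the next with 256.
    std::vector<std::pair<double, int>> minibatches = { { 1.0, 64 }, { 3.0, 256 } };

    double perMinibatch = 0, perSample = 0;
    int minibatchCount = 0, sampleCount = 0;
    for (const auto& mb : minibatches)
    {
        perMinibatch += mb.first;              // counting minibatches
        perSample += mb.first * mb.second;     // counting samples seen
        ++minibatchCount;
        sampleCount += mb.second;
    }
    printf("per-minibatch average: %g\n", perMinibatch / minibatchCount); // 2.0
    printf("per-sample average:    %g\n", perSample / sampleCount);       // 2.6
    return 0;
}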

For batch normalization, averaging and blending now handle the
initialization cases explicitly and no longer rely on the initial mean
and variance values (set in NDL/BrainScript).

Update the Windows / Linux / Docker builds.
With this commit, cuDNN v4 is no longer supported.
mahilleb-msft committed Aug 22, 2016
1 parent 46a10ad commit f76afa2
Showing 62 changed files with 3,293 additions and 541 deletions.
2 changes: 2 additions & 0 deletions CNTK.Cpp.props
@@ -1,6 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDNN_PATH>C:\NVIDIA\cudnn-5.0\cuda</CUDNN_PATH>

<!-- Note: SolutionDir / RepoRootPath are the same in current setup -->
<RepoRootPath>$(MSBuildThisFileDirectory)</RepoRootPath>
<RelativeProjectPath>$(MSBuildProjectDirectory.Substring($(MSBuildThisFileDirectory.Length)))</RelativeProjectPath>
8 changes: 4 additions & 4 deletions Examples/Image/MNIST/Config/Macros.ndl
@@ -26,9 +26,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]

@@ -72,10 +72,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue)
sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue)
m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
var = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
8 changes: 4 additions & 4 deletions Examples/Image/MNIST/Config/Shared.bs
@@ -26,9 +26,9 @@ DnnBNReLULayer (inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
b = Parameter (outDim, 1, init = "fixedValue", value = bValue)
sc = Parameter (outDim, 1, init = "fixedValue", value = scValue)
m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
isd = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
var = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
t = Times(W, x) # TODO: W * x
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
].y

@@ -61,10 +61,10 @@ ConvBNLayerW (W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeC
b = Parameter(outMap, 1, init="fixedValue", value=bValue)
sc = Parameter(outMap, 1, init="fixedValue", value=scValue)
m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
isd = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
var = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true /* , imageLayout=$imageLayout$*/)
y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
].y

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
16 changes: 8 additions & 8 deletions Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
@@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
@@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]

@@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]

4 changes: 2 additions & 2 deletions Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl
@@ -8,9 +8,9 @@ BN(inp, mapCount, bValue, scValue, bnTimeConst)
b = Parameter(mapCount, 1, init = fixedValue, value = bValue)
sc = Parameter(mapCount, 1, init = fixedValue, value = scValue)
m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

y = BatchNormalization(inp, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
y = BatchNormalization(inp, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
]

ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
8 changes: 4 additions & 4 deletions Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
@@ -15,9 +15,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false)
bn = BatchNormalization(t, sc, b, m, var, spatial = false)
y = RectifiedLinear(bn)
]

Expand Down Expand Up @@ -47,9 +47,9 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, spatial = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, var, spatial = true, imageLayout = "cudnn")
y = RectifiedLinear(bn);
]
2 changes: 1 addition & 1 deletion Makefile
@@ -28,7 +28,7 @@
# CUB_PATH= path to NVIDIA CUB installation, so $(CUB_PATH)/cub/cub.cuh exists
# defaults to /usr/local/cub-1.4.1
# CUDNN_PATH= path to NVIDIA cuDNN installation so $(CUDNN_PATH)/cuda/include/cudnn.h exists
# If not specified, CNTK will be be built without cuDNN.
# CuDNN version needs to be 5.0 or higher.
# KALDI_PATH= Path to Kaldi
# If not specified, Kaldi plugins will not be built
# OPENCV_PATH= path to OpenCV 3.1.0 installation, so $(OPENCV_PATH) exists
4 changes: 2 additions & 2 deletions Source/ActionsLib/NDLNetworkBuilder.cpp
@@ -491,15 +491,15 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
else if (cnNodeType == OperationNameOf(BatchNormalizationNode))
{
if (parameter.size() != 5)
RuntimeError("%ls should have 5 fixed parameters[inputValueNodeName, scale, bias, runMean, runInvStdDev].", cnNodeType.c_str());
RuntimeError("%ls should have 5 fixed parameters[inputValueNodeName, scale, bias, runMean, runVariance].", cnNodeType.c_str());

// setup the parameter position of children so we can hook them up later
nodeParamCount = 5;
nodeParamStart = 0;

if (pass == ndlPassInitial)
{
int id = 5; // skip inputValueNode, scale and bias, runMean, runInvStdDev.
int id = 5; // skip inputValueNode, scale and bias, runMean, runVariance.
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);

16 changes: 8 additions & 8 deletions Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@@ -141,13 +141,13 @@ BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to r
normalizationTimeConstant = 0, blendTimeConstant = 0,
epsilon = 0.00001, useCntkEngine = true} =
{
#normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
normShape = (0:1) # TODO: Update this once we support broadcasting-style parameters.
scale = ParameterTensor {normShape, initValue = initialScale}
bias = ParameterTensor {normShape, initValue = 0}
runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
runInvStdDev = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0}
apply (x) = BatchNormalization (x, scale, bias, runMean, runInvStdDev, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
#normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
normShape = (0:1) # TODO: Update this once we support broadcasting-style parameters.
scale = ParameterTensor {normShape, initValue = initialScale}
bias = ParameterTensor {normShape, initValue = 0}
runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
runVariance = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0}
apply (x) = BatchNormalization (x, scale, bias, runMean, runVariance, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
}.apply

# LayerNormalizationLayer -- create a layer-normalization layer
@@ -455,7 +455,7 @@ ColumnwiseCrossProduct = KhatriRaoProduct // deprecated
ClassificationError = ErrorPrediction
Delay = PastValue

BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
BatchNormalization(input, scale, bias, runMean, runVariance, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runVariance) /*plus the function args*/ ]
ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ]
Clip(minValue, maxValue, x, tag='') = new ComputationNode [ operation = 'Clip' ; inputs = (minValue : maxValue : x) /* plus the function args*/ ]
ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]
4 changes: 2 additions & 2 deletions Source/CNTKv2LibraryDll/API/CNTKLibrary.h
@@ -1594,8 +1594,8 @@ namespace CNTK
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
const Variable& runningStdDev,
bool spatial,
double normalizationTimeConstant = 0,
double blendTimeConstant = 0,
double epsilon = 0.00001,
2 changes: 1 addition & 1 deletion Source/CNTKv2LibraryDll/BackCompat.cpp
@@ -206,7 +206,7 @@ namespace CNTK
else if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
{
auto batchNormalizationNode = node->As<BatchNormalizationNode<ElementType>>();
primitiveFunctionConfigParameters[L"spacial"] = batchNormalizationNode->Spatial();
primitiveFunctionConfigParameters[L"spatial"] = batchNormalizationNode->Spatial();
primitiveFunctionConfigParameters[L"normalizationTimeConstant"] = batchNormalizationNode->NormalizationTimeConstant();
primitiveFunctionConfigParameters[L"blendTimeConstant"] = batchNormalizationNode->BlendTimeConstant();
primitiveFunctionConfigParameters[L"epsilon"] = batchNormalizationNode->Epsilon();
12 changes: 6 additions & 6 deletions Source/CNTKv2LibraryDll/Function.cpp
@@ -329,7 +329,7 @@ namespace CNTK
}
case PrimitiveOpType::BatchNormalization:
{
auto spacial = functionConfig[L"spacial"].GetValue<bool>();
auto spatial = functionConfig[L"spatial"].GetValue<bool>();
auto normalizationTimeConstant = functionConfig[L"normalizationTimeConstant"].GetValue<double>();
auto blendTimeConstant = functionConfig[L"blendTimeConstant"].GetValue<double>();
auto epsilon = functionConfig[L"epsilon"].GetValue<double>();
@@ -341,7 +341,7 @@
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
}

computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spacial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name());
computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name());
break;
}
case PrimitiveOpType::Combine:
@@ -1169,23 +1169,23 @@ namespace CNTK
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
const Variable& runningStdDev,
bool spatial,
double normalizationTimeConstant,
double blendTimeConstant,
double epsilon,
bool useCuDNNEngine,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"spacial"] = spacial;
additionalProperties[L"spatial"] = spatial;
additionalProperties[L"normalizationTimeConstant"] = normalizationTimeConstant;
additionalProperties[L"blendTimeConstant"] = blendTimeConstant;
additionalProperties[L"epsilon"] = epsilon;
additionalProperties[L"useCuDNNEngine"] = useCuDNNEngine;

return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::BatchNormalization,
std::vector<Variable>({ operand, scale, bias, runningMean, runningInvStd }),
std::vector<Variable>({ operand, scale, bias, runningMean, runningStdDev }),
std::move(additionalProperties),
name),
name);