Commit

Switch to CuDNN v5
For batch normalization, the running inverse standard deviation becomes
a running variance. We mirror this CuDNN v5 change in the CNTK batch
normalization engine. The model version is bumped; when old models are
loaded, the stored inverse standard deviation is (approximately)
converted to a variance.
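
For illustration, here is a rough sketch (not the actual CNTK loading
code) of the conversion idea. It assumes the old parameter stored
invStdDev = 1 / sqrt(variance + epsilon); the recovered variance is then
only approximate, since the epsilon in effect at save time may differ.

#include <vector>

// Approximate inversion: if invStdDev = 1 / sqrt(variance + epsilon),
// then variance = 1 / invStdDev^2 - epsilon.
std::vector<float> InvStdDevToVariance(const std::vector<float>& invStdDev, float epsilon)
{
    std::vector<float> variance(invStdDev.size());
    for (size_t i = 0; i < invStdDev.size(); ++i)
        variance[i] = 1.0f / (invStdDev[i] * invStdDev[i]) - epsilon;
    return variance;
}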

In the same model version change, batch normalization now counts
samples seen rather than minibatches (this fixes incorrect averaging
when the minibatch size varies across epochs).
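
As a toy illustration (not CNTK's actual update rule, which uses an
exponential average with a time constant): averaging per minibatch
over-weights small minibatches, while counting samples seen weights
every sample equally.

#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    // (minibatch mean, minibatch size): e.g. one epoch runs with size 64, the next with 256.
    std::vector<std::pair<double, int>> minibatches = { { 1.0, 64 }, { 3.0, 256 } };

    double perMinibatch = 0, perSample = 0;
    int minibatchCount = 0, sampleCount = 0;
    for (const auto& mb : minibatches)
    {
        perMinibatch += mb.first;              // counting minibatches
        perSample += mb.first * mb.second;     // counting samples seen
        ++minibatchCount;
        sampleCount += mb.second;
    }
    printf("per-minibatch average: %g\n", perMinibatch / minibatchCount); // 2.0
    printf("per-sample average:    %g\n", perSample / sampleCount);       // 2.6
    return 0;
}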

For batch normalization, averaging and blending now handle the
initialization cases explicitly and no longer rely on the initial mean
and variance values (set in NDL/BrainScript).

Update the Windows / Linux / Docker builds.
With this commit, cuDNN v4 is no longer supported.
mahilleb-msft committed Aug 22, 2016
1 parent 46a10ad commit f76afa2
Showing 62 changed files with 3,293 additions and 541 deletions.
2 changes: 2 additions & 0 deletions CNTK.Cpp.props
@@ -1,6 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDNN_PATH>C:\NVIDIA\cudnn-5.0\cuda</CUDNN_PATH>

<!-- Note: SolutionDir / RepoRootPath are the same in current setup -->
<RepoRootPath>$(MSBuildThisFileDirectory)</RepoRootPath>
<RelativeProjectPath>$(MSBuildProjectDirectory.Substring($(MSBuildThisFileDirectory.Length)))</RelativeProjectPath>
8 changes: 4 additions & 4 deletions Examples/Image/MNIST/Config/Macros.ndl
@@ -26,9 +26,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]

@@ -72,10 +72,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue)
sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue)
m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
var = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
8 changes: 4 additions & 4 deletions Examples/Image/MNIST/Config/Shared.bs
@@ -26,9 +26,9 @@ DnnBNReLULayer (inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
b = Parameter (outDim, 1, init = "fixedValue", value = bValue)
sc = Parameter (outDim, 1, init = "fixedValue", value = scValue)
m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
isd = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
var = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
t = Times(W, x) # TODO: W * x
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
].y

@@ -61,10 +61,10 @@ ConvBNLayerW (W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeC
b = Parameter(outMap, 1, init="fixedValue", value=bValue)
sc = Parameter(outMap, 1, init="fixedValue", value=scValue)
m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
isd = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
var = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true /* , imageLayout=$imageLayout$*/)
y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
].y

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
16 changes: 8 additions & 8 deletions Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
@@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
@@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]

@@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]

4 changes: 2 additions & 2 deletions Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl
@@ -8,9 +8,9 @@ BN(inp, mapCount, bValue, scValue, bnTimeConst)
b = Parameter(mapCount, 1, init = fixedValue, value = bValue)
sc = Parameter(mapCount, 1, init = fixedValue, value = scValue)
m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

y = BatchNormalization(inp, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
y = BatchNormalization(inp, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
]

ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
8 changes: 4 additions & 4 deletions Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
@@ -15,9 +15,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false)
bn = BatchNormalization(t, sc, b, m, var, spatial = false)
y = RectifiedLinear(bn)
]

Expand Down Expand Up @@ -47,9 +47,9 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, spatial = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, var, spatial = true, imageLayout = "cudnn")
y = RectifiedLinear(bn);
]
2 changes: 1 addition & 1 deletion Makefile
@@ -28,7 +28,7 @@
# CUB_PATH= path to NVIDIA CUB installation, so $(CUB_PATH)/cub/cub.cuh exists
# defaults to /usr/local/cub-1.4.1
# CUDNN_PATH= path to NVIDIA cuDNN installation so $(CUDNN_PATH)/cuda/include/cudnn.h exists
# If not specified, CNTK will be be built without cuDNN.
# CuDNN version needs to be 5.0 or higher.
# KALDI_PATH= Path to Kaldi
# If not specified, Kaldi plugins will not be built
# OPENCV_PATH= path to OpenCV 3.1.0 installation, so $(OPENCV_PATH) exists
4 changes: 2 additions & 2 deletions Source/ActionsLib/NDLNetworkBuilder.cpp
@@ -491,15 +491,15 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
else if (cnNodeType == OperationNameOf(BatchNormalizationNode))
{
if (parameter.size() != 5)
RuntimeError("%ls should have 5 fixed parameters[inputValueNodeName, scale, bias, runMean, runInvStdDev].", cnNodeType.c_str());
RuntimeError("%ls should have 5 fixed parameters[inputValueNodeName, scale, bias, runMean, runVariance].", cnNodeType.c_str());

// setup the parameter position of children so we can hook them up later
nodeParamCount = 5;
nodeParamStart = 0;

if (pass == ndlPassInitial)
{
int id = 5; // skip inputValueNode, scale and bias, runMean, runInvStdDev.
int id = 5; // skip inputValueNode, scale and bias, runMean, runVariance.
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);

16 changes: 8 additions & 8 deletions Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@@ -141,13 +141,13 @@ BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to r
normalizationTimeConstant = 0, blendTimeConstant = 0,
epsilon = 0.00001, useCntkEngine = true} =
{
#normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
normShape = (0:1) # TODO: Update this once we support broadcasting-style parameters.
scale = ParameterTensor {normShape, initValue = initialScale}
bias = ParameterTensor {normShape, initValue = 0}
runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
runInvStdDev = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0}
apply (x) = BatchNormalization (x, scale, bias, runMean, runInvStdDev, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
#normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
normShape = (0:1) # TODO: Update this once we support broadcasting-style parameters.
scale = ParameterTensor {normShape, initValue = initialScale}
bias = ParameterTensor {normShape, initValue = 0}
runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
runVariance = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0}
apply (x) = BatchNormalization (x, scale, bias, runMean, runVariance, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
}.apply

# LayerNormalizationLayer -- create a layer-normalization layer
@@ -455,7 +455,7 @@ ColumnwiseCrossProduct = KhatriRaoProduct // deprecated
ClassificationError = ErrorPrediction
Delay = PastValue

BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
BatchNormalization(input, scale, bias, runMean, runVariance, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runVariance) /*plus the function args*/ ]
ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ]
Clip(minValue, maxValue, x, tag='') = new ComputationNode [ operation = 'Clip' ; inputs = (minValue : maxValue : x) /* plus the function args*/ ]
ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]
4 changes: 2 additions & 2 deletions Source/CNTKv2LibraryDll/API/CNTKLibrary.h
@@ -1594,8 +1594,8 @@ namespace CNTK
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
const Variable& runningStdDev,
bool spatial,
double normalizationTimeConstant = 0,
double blendTimeConstant = 0,
double epsilon = 0.00001,
2 changes: 1 addition & 1 deletion Source/CNTKv2LibraryDll/BackCompat.cpp
@@ -206,7 +206,7 @@ namespace CNTK
else if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
{
auto batchNormalizationNode = node->As<BatchNormalizationNode<ElementType>>();
primitiveFunctionConfigParameters[L"spacial"] = batchNormalizationNode->Spatial();
primitiveFunctionConfigParameters[L"spatial"] = batchNormalizationNode->Spatial();
primitiveFunctionConfigParameters[L"normalizationTimeConstant"] = batchNormalizationNode->NormalizationTimeConstant();
primitiveFunctionConfigParameters[L"blendTimeConstant"] = batchNormalizationNode->BlendTimeConstant();
primitiveFunctionConfigParameters[L"epsilon"] = batchNormalizationNode->Epsilon();
12 changes: 6 additions & 6 deletions Source/CNTKv2LibraryDll/Function.cpp
@@ -329,7 +329,7 @@ namespace CNTK
}
case PrimitiveOpType::BatchNormalization:
{
auto spacial = functionConfig[L"spacial"].GetValue<bool>();
auto spatial = functionConfig[L"spatial"].GetValue<bool>();
auto normalizationTimeConstant = functionConfig[L"normalizationTimeConstant"].GetValue<double>();
auto blendTimeConstant = functionConfig[L"blendTimeConstant"].GetValue<double>();
auto epsilon = functionConfig[L"epsilon"].GetValue<double>();
@@ -341,7 +341,7 @@
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
}

computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spacial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name());
computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name());
break;
}
case PrimitiveOpType::Combine:
@@ -1169,23 +1169,23 @@ namespace CNTK
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
const Variable& runningStdDev,
bool spatial,
double normalizationTimeConstant,
double blendTimeConstant,
double epsilon,
bool useCuDNNEngine,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"spacial"] = spacial;
additionalProperties[L"spatial"] = spatial;
additionalProperties[L"normalizationTimeConstant"] = normalizationTimeConstant;
additionalProperties[L"blendTimeConstant"] = blendTimeConstant;
additionalProperties[L"epsilon"] = epsilon;
additionalProperties[L"useCuDNNEngine"] = useCuDNNEngine;

return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::BatchNormalization,
std::vector<Variable>({ operand, scale, bias, runningMean, runningInvStd }),
std::vector<Variable>({ operand, scale, bias, runningMean, runningStdDev }),
std::move(additionalProperties),
name),
name);