From f76afa2b7e7f6de04e09775c439ef843137107f3 Mon Sep 17 00:00:00 2001
From: Mark Hillebrand
Date: Mon, 8 Aug 2016 09:13:39 +0200
Subject: [PATCH] Switch to CuDNN v5

For batch normalization, the running inverse standard deviation becomes the
running variance. We mirror this CuDNN v5 change in the CNTK batch
normalization engine. The model version is bumped. When old models are loaded,
this parameter is (approximately) converted.

In the same model version change, let batch normalization count samples seen
rather than minibatches (this deals with incorrect averaging when the
minibatch size is varied across epochs).

For batch normalization averaging and blending, handle initialization cases;
don't rely on the mean and variance initial values (set in NDL/BrainScript).

Update Windows / Linux / Docker build.

With this commit, CuDNN v4 is no longer supported.
---
 CNTK.Cpp.props | 2 +
 Examples/Image/MNIST/Config/Macros.ndl | 8 +-
 Examples/Image/MNIST/Config/Shared.bs | 8 +-
 .../Image/Miscellaneous/CIFAR-10/Macros.ndl | 16 +-
 .../Miscellaneous/ImageNet/ResNet/Macros.ndl | 4 +-
 .../Miscellaneous/ImageNet/VGG/Macros.ndl | 8 +-
 Makefile | 2 +-
 Source/ActionsLib/NDLNetworkBuilder.cpp | 4 +-
 .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 16 +-
 Source/CNTKv2LibraryDll/API/CNTKLibrary.h | 4 +-
 Source/CNTKv2LibraryDll/BackCompat.cpp | 2 +-
 Source/CNTKv2LibraryDll/Function.cpp | 12 +-
 .../ComputationNetworkBuilder.cpp | 4 +-
 .../ComputationNetworkBuilder.h | 2 +-
 .../ComputationNetworkLib/ComputationNode.h | 3 +-
 Source/ComputationNetworkLib/TrainingNodes.h | 137 ++--
 Source/Math/BatchNormalizationEngine.cpp | 14 +-
 Source/Math/BatchNormalizationEngine.h | 4 +-
 Source/Math/CPUMatrix.cpp | 11 +-
 Source/Math/CPUMatrix.h | 8 +-
 Source/Math/CntkBatchNormalization.cuh | 241 ++++---
 Source/Math/CuDnnBatchNormalization.cu | 13 +-
 Source/Math/CuDnnCommon.h | 3 +
 Source/Math/CuDnnConvolutionEngine.cu | 6 -
 Source/Math/GPUMatrix.cu | 65 +-
 Source/Math/GPUMatrix.h | 2 +-
 Source/Math/MathCUDA.vcxproj | 2 +-
 Source/Math/Matrix.cpp | 6 +-
 Source/Math/Matrix.h | 2 +-
 Source/Math/NoGPU.cpp | 2 +-
 Source/SGDLib/SGD.cpp | 1 +
 .../NonSpatial/01_OneHidden.cntk | 81 +++
 .../NonSpatial/01_OneHidden.ndl | 39 ++
 .../NonSpatial/CNTK/baseline.linux.txt | 1 +
 .../NonSpatial/CNTK/baseline.windows.txt | 480 ++++++++++++++
 .../NonSpatial/CNTK/run-test | 19 +
 .../NonSpatial/CNTK/testcases.yml | 38 ++
 .../NonSpatial/CuDNN/baseline.linux.txt | 1 +
 .../NonSpatial/CuDNN/baseline.windows.txt | 480 ++++++++++++++
 .../NonSpatial/CuDNN/run-test | 11 +
 .../NonSpatial/CuDNN/testcases.yml | 39 ++
 .../NonSpatial/run-test-common | 26 +
 .../Spatial/02_BatchNormConv.cntk | 79 +++
 .../Spatial/02_BatchNormConv.ndl | 65 ++
 .../Spatial/CNTK/baseline.linux.txt | 1 +
 .../Spatial/CNTK/baseline.windows.txt | 494 +++++++++++++++
 .../BatchNormalization/Spatial/CNTK/run-test | 19 +
 .../Spatial/CNTK/testcases.yml | 31 +
 .../Spatial/CuDNN/baseline.linux.txt | 1 +
 .../Spatial/CuDNN/baseline.windows.txt | 494 +++++++++++++++
 .../BatchNormalization/Spatial/CuDNN/run-test | 11 +
 .../Spatial/CuDNN/testcases.yml | 31 +
 .../BatchNormalization/Spatial/Macros.ndl | 148 +++++
 .../Spatial/run-test-common | 26 +
 .../02_BatchNormConv/baseline.windows.txt | 588 +++++++++---------
 Tests/UnitTests/EvalTests/EvalTests.vcxproj | 2 +-
 .../BatchNormalizationEngineTests.cpp | 5 +-
 .../MathTests/ConvolutionEngineTests.cpp | 2 +-
 Tests/UnitTests/MathTests/MathTests.vcxproj | 2 +-
 Tests/UnitTests/V2LibraryTests/Image.h | 4 +-
 Tools/docker/CNTK-GPU-Image/Dockerfile | 2 +-
 configure | 2 +-
 62 files changed, 3293
insertions(+), 541 deletions(-) create mode 100644 Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk create mode 100644 Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl create mode 100644 Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/baseline.linux.txt create mode 100644 Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/baseline.windows.txt create mode 100755 Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/run-test create mode 100644 Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/testcases.yml create mode 100644 Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/baseline.linux.txt create mode 100644 Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/baseline.windows.txt create mode 100755 Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/run-test create mode 100644 Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/testcases.yml create mode 100755 Tests/EndToEndTests/BatchNormalization/NonSpatial/run-test-common create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/baseline.linux.txt create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/baseline.windows.txt create mode 100755 Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/run-test create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/baseline.linux.txt create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/baseline.windows.txt create mode 100755 Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/run-test create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml create mode 100644 Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl create mode 100755 Tests/EndToEndTests/BatchNormalization/Spatial/run-test-common diff --git a/CNTK.Cpp.props b/CNTK.Cpp.props index b8f9db6739e3..c6f665fcc4d5 100644 --- a/CNTK.Cpp.props +++ b/CNTK.Cpp.props @@ -1,6 +1,8 @@ + C:\NVIDIA\cudnn-5.0\cuda + $(MSBuildThisFileDirectory) $(MSBuildProjectDirectory.Substring($(MSBuildThisFileDirectory.Length))) diff --git a/Examples/Image/MNIST/Config/Macros.ndl b/Examples/Image/MNIST/Config/Macros.ndl index 0d57d869a848..cfa1e2baaf37 100644 --- a/Examples/Image/MNIST/Config/Macros.ndl +++ b/Examples/Image/MNIST/Config/Macros.ndl @@ -26,9 +26,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [ b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] @@ -72,10 +72,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue) sc = LearnableParameter(outMap, 1, init=fixedValue, 
value=scValue) m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) - isd = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) + var = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$) - y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$) + y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$) ] ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [ diff --git a/Examples/Image/MNIST/Config/Shared.bs b/Examples/Image/MNIST/Config/Shared.bs index 2c66ad109969..2354465df26f 100644 --- a/Examples/Image/MNIST/Config/Shared.bs +++ b/Examples/Image/MNIST/Config/Shared.bs @@ -26,9 +26,9 @@ DnnBNReLULayer (inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [ b = Parameter (outDim, 1, init = "fixedValue", value = bValue) sc = Parameter (outDim, 1, init = "fixedValue", value = scValue) m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) - isd = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) + var = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) t = Times(W, x) # TODO: W * x - bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ].y @@ -61,10 +61,10 @@ ConvBNLayerW (W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeC b = Parameter(outMap, 1, init="fixedValue", value=bValue) sc = Parameter(outMap, 1, init="fixedValue", value=scValue) m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) - isd = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) + var = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true /* , imageLayout=$imageLayout$*/) - y = BatchNormalization(c, sc, b, m, isd, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/) + y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/) ].y ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [ diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl index 852b1a5a761e..552f51951f80 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl @@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue) m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, kW, kH, outMap, hStride, 
vStride, zeroPadding = true, imageLayout = $imageLayout$) - y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) + y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) ] ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) @@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst) b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue) m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$) - y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) + y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) ] ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst) @@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] @@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl index 0d7c1fd989db..ca15855264dd 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl @@ -8,9 +8,9 @@ BN(inp, mapCount, bValue, scValue, bnTimeConst) b = Parameter(mapCount, 1, init = fixedValue, value = bValue) sc = Parameter(mapCount, 1, init = fixedValue, value = scValue) m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - isd = Parameter(mapCount, 1, init = fixedValue, value = 0, 
learningRateMultiplier = 0) + var = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - y = BatchNormalization(inp, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn") + y = BatchNormalization(inp, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn") ] ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl index f2b53a46e52a..82d3b8582358 100644 --- a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl @@ -15,9 +15,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue) b = Parameter(outDim, 1, init = fixedValue, value = bValue) sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01) m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, isd, spatial = false) + bn = BatchNormalization(t, sc, b, m, var, spatial = false) y = RectifiedLinear(bn) ] @@ -47,9 +47,9 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, b = Parameter(outMap, 1, init = fixedValue, value = bValue) sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn") - bn = BatchNormalization(c, sc, b, m, isd, spatial = true, imageLayout = "cudnn") + bn = BatchNormalization(c, sc, b, m, var, spatial = true, imageLayout = "cudnn") y = RectifiedLinear(bn); ] diff --git a/Makefile b/Makefile index 04643adb59ff..ed44be9d096f 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ # CUB_PATH= path to NVIDIA CUB installation, so $(CUB_PATH)/cub/cub.cuh exists # defaults to /usr/local/cub-1.4.1 # CUDNN_PATH= path to NVIDIA cuDNN installation so $(CUDNN_PATH)/cuda/include/cudnn.h exists -# If not specified, CNTK will be be built without cuDNN. +# CuDNN version needs to be 5.0 or higher. 
# KALDI_PATH= Path to Kaldi # If not specified, Kaldi plugins will not be built # OPENCV_PATH= path to OpenCV 3.1.0 installation, so $(OPENCV_PATH) exists diff --git a/Source/ActionsLib/NDLNetworkBuilder.cpp b/Source/ActionsLib/NDLNetworkBuilder.cpp index 2f7545cb1dbc..a6cd11caab36 100644 --- a/Source/ActionsLib/NDLNetworkBuilder.cpp +++ b/Source/ActionsLib/NDLNetworkBuilder.cpp @@ -491,7 +491,7 @@ void NDLNodeEvaluatorImpl::Evaluate(NDLNode* node, const wst else if (cnNodeType == OperationNameOf(BatchNormalizationNode)) { if (parameter.size() != 5) - RuntimeError("%ls should have 5 fixed parameters[inputValueNodeName, scale, bias, runMean, runInvStdDev].", cnNodeType.c_str()); + RuntimeError("%ls should have 5 fixed parameters[inputValueNodeName, scale, bias, runMean, runVariance].", cnNodeType.c_str()); // setup the parameter position of children so we can hook them up later nodeParamCount = 5; @@ -499,7 +499,7 @@ void NDLNodeEvaluatorImpl::Evaluate(NDLNode* node, const wst if (pass == ndlPassInitial) { - int id = 5; // skip inputValueNode, scale and bias, runMean, runInvStdDev. + int id = 5; // skip inputValueNode, scale and bias, runMean, runVariance. // evaluate only scalar parameters vector params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass); diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index cc97de6f8268..e9411eb4fdfb 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -141,13 +141,13 @@ BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to r normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true} = { - #normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input) - normShape = (0:1) # TODO: Update this once we support broadcasting-style parameters. - scale = ParameterTensor {normShape, initValue = initialScale} - bias = ParameterTensor {normShape, initValue = 0} - runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently - runInvStdDev = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} - apply (x) = BatchNormalization (x, scale, bias, runMean, runInvStdDev, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine) + #normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input) + normShape = (0:1) # TODO: Update this once we support broadcasting-style parameters. 
+ scale = ParameterTensor {normShape, initValue = initialScale} + bias = ParameterTensor {normShape, initValue = 0} + runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently + runVariance = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} + apply (x) = BatchNormalization (x, scale, bias, runMean, runVariance, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine) }.apply # LayerNormalizationLayer -- create a layer-normalization layer @@ -455,7 +455,7 @@ ColumnwiseCrossProduct = KhatriRaoProduct // deprecated ClassificationError = ErrorPrediction Delay = PastValue -BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ] +BatchNormalization(input, scale, bias, runMean, runVariance, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runVariance) /*plus the function args*/ ] ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ] Clip(minValue, maxValue, x, tag='') = new ComputationNode [ operation = 'Clip' ; inputs = (minValue : maxValue : x) /* plus the function args*/ ] ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] diff --git a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h index be1c1bfa72bc..f6a3581c65a9 100644 --- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h +++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h @@ -1594,8 +1594,8 @@ namespace CNTK const Variable& scale, const Variable& bias, const Variable& runningMean, - const Variable& runningInvStd, - bool spacial, + const Variable& runningStdDev, + bool spatial, double normalizationTimeConstant = 0, double blendTimeConstant = 0, double epsilon = 0.00001, diff --git a/Source/CNTKv2LibraryDll/BackCompat.cpp b/Source/CNTKv2LibraryDll/BackCompat.cpp index a785d9c585d7..4a4f1686585f 100644 --- a/Source/CNTKv2LibraryDll/BackCompat.cpp +++ b/Source/CNTKv2LibraryDll/BackCompat.cpp @@ -206,7 +206,7 @@ namespace CNTK else if (node->OperationName() == OperationNameOf(BatchNormalizationNode)) { auto batchNormalizationNode = node->As>(); - primitiveFunctionConfigParameters[L"spacial"] = batchNormalizationNode->Spatial(); + primitiveFunctionConfigParameters[L"spatial"] = batchNormalizationNode->Spatial(); primitiveFunctionConfigParameters[L"normalizationTimeConstant"] = batchNormalizationNode->NormalizationTimeConstant(); primitiveFunctionConfigParameters[L"blendTimeConstant"] = batchNormalizationNode->BlendTimeConstant(); primitiveFunctionConfigParameters[L"epsilon"] = batchNormalizationNode->Epsilon(); diff --git 
a/Source/CNTKv2LibraryDll/Function.cpp b/Source/CNTKv2LibraryDll/Function.cpp index e1c17439bf5e..7201e53e352a 100644 --- a/Source/CNTKv2LibraryDll/Function.cpp +++ b/Source/CNTKv2LibraryDll/Function.cpp @@ -329,7 +329,7 @@ namespace CNTK } case PrimitiveOpType::BatchNormalization: { - auto spacial = functionConfig[L"spacial"].GetValue(); + auto spatial = functionConfig[L"spatial"].GetValue(); auto normalizationTimeConstant = functionConfig[L"normalizationTimeConstant"].GetValue(); auto blendTimeConstant = functionConfig[L"blendTimeConstant"].GetValue(); auto epsilon = functionConfig[L"epsilon"].GetValue(); @@ -341,7 +341,7 @@ namespace CNTK inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As>()->shared_from_this() : nullptr); } - computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spacial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name()); + computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name()); break; } case PrimitiveOpType::Combine: @@ -1169,8 +1169,8 @@ namespace CNTK const Variable& scale, const Variable& bias, const Variable& runningMean, - const Variable& runningInvStd, - bool spacial, + const Variable& runningStdDev, + bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, @@ -1178,14 +1178,14 @@ namespace CNTK const std::wstring& name) { auto additionalProperties = Dictionary(); - additionalProperties[L"spacial"] = spacial; + additionalProperties[L"spatial"] = spatial; additionalProperties[L"normalizationTimeConstant"] = normalizationTimeConstant; additionalProperties[L"blendTimeConstant"] = blendTimeConstant; additionalProperties[L"epsilon"] = epsilon; additionalProperties[L"useCuDNNEngine"] = useCuDNNEngine; return CompositeFunction::Create(MakeSharedObject(PrimitiveOpType::BatchNormalization, - std::vector({ operand, scale, bias, runningMean, runningInvStd }), + std::vector({ operand, scale, bias, runningMean, runningStdDev }), std::move(additionalProperties), name), name); diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index dc8ac86671b1..4f8ff5d8cbf9 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -789,12 +789,12 @@ shared_ptr> ComputationNetworkBuilder::Looku template shared_ptr> ComputationNetworkBuilder::BatchNormalization(const ComputationNodePtr input, - const ComputationNodePtr scale, const ComputationNodePtr bias, const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, + const ComputationNodePtr scale, const ComputationNodePtr bias, const ComputationNodePtr runMean, const ComputationNodePtr runVariance, bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind, const std::wstring nodeName) { - return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, useCntkEngine, imageLayoutKind), { input, scale, bias, runMean, runInvStdDev }); + return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName, spatial, normalizationTimeConstant, 
blendTimeConstant, epsilon, useCntkEngine, imageLayoutKind), { input, scale, bias, runMean, runVariance }); } template class ComputationNetworkBuilder; diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h index 49442917f549..dfa3ebae1b2a 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h @@ -69,7 +69,7 @@ class ComputationNetworkBuilder // The following functions create nodes and link them to the network and their inputs. // TODO: Do we need both this set and the one above that does not add inputs? Can they share more code? ComputationNodePtr BatchNormalization(const ComputationNodePtr input, const ComputationNodePtr scale, const ComputationNodePtr bias, - const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool spatial = false, double normalizationTimeConstant = 0, double blendTimeConstant = 0, double epsilon = 1e-5, bool useCntkEngine = true, + const ComputationNodePtr runMean, const ComputationNodePtr runVariance, bool spatial = false, double normalizationTimeConstant = 0, double blendTimeConstant = 0, double epsilon = 1e-5, bool useCntkEngine = true, ImageLayoutKind imageLayoutKind = ImageLayoutKind::CHW, const std::wstring nodeName = L""); ComputationNodePtr Convolution(const ComputationNodePtr weight, const ComputationNodePtr inputValues, diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 853c9a159b6b..e94ffff1e38b 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -39,7 +39,8 @@ #define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs #define CNTK_MODEL_VERSION_9 9 // Transpose flag in ConvolutionNode to support deconvolution. #define CNTK_MODEL_VERSION_10 10 // Learning rate multiplier for input nodes. -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_10 +#define CNTK_MODEL_VERSION_11 11 // Batch norm: switch running inverse std deviation -> variance, MB count -> samplesSeen; CuDNN v5 +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_11 extern bool g_shareNodeValueMatrices; diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h index 0d777db49ee4..c6a5dcfd4603 100644 --- a/Source/ComputationNetworkLib/TrainingNodes.h +++ b/Source/ComputationNetworkLib/TrainingNodes.h @@ -1534,7 +1534,7 @@ template class DropoutNode; template class DropoutNode; // ----------------------------------------------------------------------- -// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev, +// BatchNormalizationNode (input, scale, bias, runMean, runVariance, // spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, // epsilon = 0.00001, // useCntkEngine = true, imageLayout = 'cudnn') @@ -1560,18 +1560,18 @@ template class DropoutNode; // More correct would be to infer that from broadcasting dimensions (spatial mode is broadcasting). // * runMean is the running mean which is used during evaluation phase and might be used during training as well. // It is represented as a LearnableParameter with the same dimensions as scale and bias. -// * runInvStdDev is the running inverse square root of variance(so InvStdDev = 1 / sqrt(var + epsilon)). +// * runVariance is the running variance which is used during evaluation phase and might be used during training as well. 
// It is represented as a LearnableParameter with the same dimensions as scale and bias. -// * spatial is a flag that specifies whether to compute mean / var for each feature in a mininbatch independently or, in case of convolutional layers, per feature map. +// * spatial is a flag that specifies whether to compute mean / var for each feature in a minibatch independently or, in case of convolutional layers, per feature map. // TODO: This must be configured in a generic fashion where tensor axes are chosen along which parameters are tied. // * normalizationTimeConstant is the time constant which is used to compute running average of mean and variance. -// Value 0 (default) means there will be no exponential smoothing and running mean/variance will always have values computed for the last seen mininbatch. -// Value 1#INF (infinity) means running values are "frozen" (i.e.will not be updated). +// Value 0 (default) means there will be no exponential smoothing and running mean/variance will always have values computed for the last seen minibatch. +// Value 1#INF (infinity) means running values are "frozen" (i.e., they will not be updated). // * blendTimeConstant is the time constant which allows to specify how much of running mean / var should be "blended" into mean / var of the current minibatch. // Value 0 (default) means no blending will happen and only the current minibatch statistics will be used. // Value 1#INF (infinity) means only running mean / var will be used(this is used, for example, in evaluation phase). -// * epsilon is a conditioner constant used in computing InvStdDev -// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use : CNTK or cuDNN-based. +// * epsilon is a conditioner constant used in computing inverted standard deviation +// * useCntkEngine is a Boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based. // * imageLayout is the image layout. Only cudnn is supported at present. 
// ----------------------------------------------------------------------- template @@ -1583,13 +1583,15 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi public: BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true), - m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW) + m_samplesSeen(0), m_imageLayoutKind(ImageLayoutKind::CHW), + m_convertRunningVariance(false) { } BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) : Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant), - m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0) + m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_samplesSeen(0), + m_convertRunningVariance(false) { } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : @@ -1609,7 +1611,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi fstream << m_normTimeConst; fstream << m_blendTimeConst; fstream << (int32_t)m_imageLayoutKind; - fstream << m_mbCount; + fstream << m_samplesSeen; fstream << m_epsilon; fstream << m_useCntkEngine; } @@ -1620,11 +1622,14 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi if (modelVersion >= CNTK_MODEL_VERSION_6) { + size_t mbCount; fstream >> m_spatial; fstream >> m_normTimeConst; fstream >> m_blendTimeConst; fstream >> m_imageLayoutKind; - fstream >> m_mbCount; + fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str()); + fstream >> mbCount; + m_samplesSeen = mbCount; fstream >> m_epsilon; fstream >> m_useCntkEngine; } @@ -1659,8 +1664,11 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi } if (verWritten >= 0x00010002) { + size_t mbCount; fstream >> m_imageLayoutKind; - fstream >> m_mbCount; + fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str()); + fstream >> mbCount; + m_samplesSeen = mbCount; } if (verWritten >= 0x00010003) { @@ -1668,6 +1676,14 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi fstream >> m_useCntkEngine; } } + + if (modelVersion < CNTK_MODEL_VERSION_11) + { + // Prior to CNTK_MODEL_VERSION_11, running inverse standard + // deviation was stored in Input 4. Now variance is used. + // We (approximately) convert it during validation later. 
+ m_convertRunningVariance = true; + } } void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override @@ -1682,7 +1698,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi node->m_normTimeConst = m_normTimeConst; node->m_blendTimeConst = m_blendTimeConst; node->m_imageLayoutKind = m_imageLayoutKind; - node->m_mbCount = m_mbCount; + node->m_samplesSeen = m_samplesSeen; node->m_epsilon = m_epsilon; node->m_useCntkEngine = m_useCntkEngine; } @@ -1696,16 +1712,26 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi { // in inference mode, only use long-term mean and do not update running estimates if (!Environment().IsTraining()) - return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch + { + assert(0 < m_samplesSeen); // something must have been trained + return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch + } + + // Initialization case: only use current minibatch. + if (m_samplesSeen == 0) + { + return 1.0; + } + + double numSamples = (double)GetMBLayout()->GetActualNumSamples(); // REVIEW alexeyk: hack, m_normTimeConst < 0 is used to denote corpus-level statistics (without forgetting factor). if (m_normTimeConst < 0) - return 1.0 / (1.0 + m_mbCount); // (this is the hack case) TODO: verify this formula; shouldn't we use #samples instead of MB count? + return numSamples / (numSamples + m_samplesSeen); // (this is the hack case) - // Convert to per-minibatch factor. The limit, positivie infinity, means that running mean/var parameters are "frozen" + // Convert to per-minibatch factor. The limit, positive infinity, means that running mean/var parameters are "frozen" // that is, do not require updates. // The code below special-cases two boundary cases, but those are just the limit cases of the main formula. - double numSamples = (double)GetMBLayout()->GetActualNumSamples(); if (!isfinite(m_normTimeConst)) // infinite return 0; // no new contribution from current minibatch (infinitely long memory) else if (m_normTimeConst > 0) // not zero @@ -1720,7 +1746,16 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi { // in inference mode, only use long-term mean and do not update running estimates if (!Environment().IsTraining()) - return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate + { + assert(0 < m_samplesSeen); // something must have been trained + return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate + } + + // Initialization case: only use current minibatch. + if (m_samplesSeen == 0) + { + return 0; + } // convert to blend factor (= weight for running stats) // The code below special-cases two boundary cases, but those are just the limit cases of the main formula. 
@@ -1736,21 +1771,22 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { + assert(!m_convertRunningVariance); FrameRange fr(Input(0)->GetMBLayout()); Matrix sliceInputValue = Input(0)->ValueFor(fr); const Matrix& scale = Input(1)->Value(); const Matrix& bias = Input(2)->Value(); Matrix& runMean = Input(3)->Value(); - Matrix& runInvStdDev = Input(4)->Value(); + Matrix& runVariance = Input(4)->Value(); Matrix sliceOutputValue = ValueFor(fr); assert(scale.GetNumRows() == bias.GetNumRows()); assert(scale.GetNumCols() == bias.GetNumCols()); assert(runMean.GetNumRows() == scale.GetNumRows()); assert(runMean.GetNumCols() == scale.GetNumCols()); - assert(runMean.GetNumRows() == runInvStdDev.GetNumRows()); - assert(runMean.GetNumCols() == runInvStdDev.GetNumCols()); + assert(runMean.GetNumRows() == runVariance.GetNumRows()); + assert(runMean.GetNumCols() == runVariance.GetNumCols()); // determine the factors from the time constants double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this) @@ -1758,12 +1794,10 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in) expAvgFactor, blendFactor, - runMean, runInvStdDev, // (in/out) running estimates, updated from the current MB mean/stddev + runMean, runVariance, // (in/out) running estimates, updated from the current MB mean/variance /*out=*/ sliceOutputValue, // (out) batch-normalized output value m_epsilon, *m_saveMean, *m_saveInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine - - m_mbCount++; } // Note: This function assumes that inputIndex=0 is called before the others. @@ -1775,19 +1809,20 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi if (inputIndex == 0) // derivative with respect to the input. { - auto sliceOutputGrad = GradientFor(fr); - auto sliceInputValue = Input(0)->ValueFor(fr); - const Matrix& scale = Input(1)->Value(); - const Matrix& bias = Input(2)->Value(); - const Matrix& runMean = Input(3)->Value(); - const Matrix& runInvStdDev = Input(4)->Value(); + auto sliceOutputGrad = GradientFor(fr); + auto sliceInputValue = Input(0)->ValueFor(fr); + const Matrix& scale = Input(1)->Value(); + const Matrix& bias = Input(2)->Value(); + const Matrix& runMean = Input(3)->Value(); + const Matrix& runVariance = Input(4)->Value(); auto sliceInputGrad = Input(0)->GradientFor(fr); // The mean used in Forward() are either saveMean or runMean. // This is decided by the engine, which communicates back the decision by returning - // an empty saveMean in case runMean should be used. Likewise for stddev. - let& actualMean = !m_saveMean->IsEmpty() ? *m_saveMean : runMean; // empty if only the running mean is used - let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runInvStdDev; + // an empty saveMean in case runMean should be used. Likewise for variance / inverted standard deviation. + let& actualMean = !m_saveMean->IsEmpty() ? *m_saveMean : runMean; // empty if only the running mean is used + if (m_saveInvStdDev->IsEmpty()) RuntimeError("TODO m_saveInvStdDev <-> runVariance not the same:"); + let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? 
*m_saveInvStdDev : runVariance; m_dScale->Resize(scale); // gradients for scale and bias get stored here m_dBias->Resize(bias); @@ -1815,7 +1850,14 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dBias->Data()); // BUGBUG: ^^ Also here, this should add the gradient, not overwrite it. } - // No derivatives with respect to running mean and InvStdDev. + // No derivatives with respect to running mean and variance. + } + + virtual void EndBackprop() override + { + auto numSamples = GetMBLayout()->GetActualNumSamples(); + m_samplesSeen += numSamples; + Base::EndBackprop(); } virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } @@ -1850,6 +1892,21 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi if (isFinalValidationPass) { + if (m_convertRunningVariance) + { + // Input 4 is still inverse standard deviation. We convert it to variance, approximately, + // and output a warning. + fprintf(stderr, "WARNING: %ls: approximately converting inverse standard deviation into variance when loading pre-CuDNNv5 model\n", + NodeName().c_str()); + Matrix& runInvStdDev = Input(4)->Value(); + runInvStdDev.AssignElementPowerOf(runInvStdDev, 2); + runInvStdDev.ElementInverse(); + runInvStdDev += (float) m_epsilon; + fprintf(stderr, "--- %ls converted runVariance after loading\n", NodeName().c_str()); + runInvStdDev.Print(); + m_convertRunningVariance = false; + } + // check inputs for (size_t i = 1; i < GetNumInputs(); i++) { @@ -1968,7 +2025,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi // Roughly, this specifies how many samples "worth" is the running statistics, // relative to the current minibatch statistics. // If 0, only use the current MB statistics. If infinity, use only the running mean, like in inference mode. - // The main idea is to estimate the mean/variance as a MAP estimate using the running mean/var as a prrior. + // The main idea is to estimate the mean/variance as a MAP estimate using the running mean/var as a prior. // This should make the method more robust to the case of very small minibatches, // and also provides a meaningful interpretation of inference mode, where only the prior is used. // Effectively, this ends up in a linear interpolation of running and minibatch statistics. @@ -1978,7 +2035,7 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi // REVIEW alexeyk: if this works, document it properly in Wiki. double m_blendTimeConst; - // Epsilon used to compute inverse std deviation. + // Epsilon used to compute inverse standard deviation (m_saveInvStdDev). double m_epsilon; // Whether to use CNTK or cuDNN BN implementation. bool m_useCntkEngine; @@ -1987,10 +2044,10 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi // --- working variables - // Minibatch count, used to compute cumulative moving average. - size_t m_mbCount; + // Samples seen count, used to compute cumulative moving average. + size_t m_samplesSeen; - // Interpolated actual mean/stddev values. Pre-computed on forward pass, also used in gradient computation. + // Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation. shared_ptr> m_saveMean; shared_ptr> m_saveInvStdDev; // Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls. 
@@ -1999,6 +2056,8 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi shared_ptr> m_dBias; std::unique_ptr> m_bnEng; + + bool m_convertRunningVariance; }; template class BatchNormalizationNode; diff --git a/Source/Math/BatchNormalizationEngine.cpp b/Source/Math/BatchNormalizationEngine.cpp index 3c8e232e343b..c254c8c64211 100644 --- a/Source/Math/BatchNormalizationEngine.cpp +++ b/Source/Math/BatchNormalizationEngine.cpp @@ -10,7 +10,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template -void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev, +void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) { assert(in.GetNumRows() == m_inOutT.GetNumElements()); @@ -24,22 +24,22 @@ void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const M assert(m_inOutT.GetNumElements() == scale.GetNumRows()); assert(m_inOutT.GetNumElements() == bias.GetNumRows()); assert(m_inOutT.GetNumElements() == runMean.GetNumRows()); - assert(m_inOutT.GetNumElements() == runInvStdDev.GetNumRows()); + assert(m_inOutT.GetNumElements() == runVariance.GetNumRows()); } else { assert((m_inOutT.GetNumElements() % scale.GetNumRows()) == 0); assert((m_inOutT.GetNumElements() % bias.GetNumRows()) == 0); assert((m_inOutT.GetNumElements() % runMean.GetNumRows()) == 0); - assert((m_inOutT.GetNumElements() % runInvStdDev.GetNumRows()) == 0); + assert((m_inOutT.GetNumElements() % runVariance.GetNumRows()) == 0); } assert(scale.GetNumCols() == 1); assert(bias.GetNumCols() == 1); assert(runMean.GetNumCols() == 1); - assert(runInvStdDev.GetNumCols() == 1); + assert(runVariance.GetNumCols() == 1); EnsureCompatible(); - ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev); + ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev); if (!m_spatial) { @@ -89,10 +89,10 @@ class CntkBatchNormEngine : public BatchNormEngine InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout."); } - void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev, + void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override { - in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev); + in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev); } void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, diff --git a/Source/Math/BatchNormalizationEngine.h b/Source/Math/BatchNormalizationEngine.h index b3b1909568ce..d4beb82ae8d6 100644 --- a/Source/Math/BatchNormalizationEngine.h +++ b/Source/Math/BatchNormalizationEngine.h @@ -34,7 +34,7 @@ class MATH_API BatchNormEngine public: virtual ~BatchNormEngine() = default; - void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev, + void 
Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev); void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, @@ -56,7 +56,7 @@ class MATH_API BatchNormEngine virtual void EnsureCompatible() = 0; // saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched - virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev, + virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0; virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 5c0b78381cfc..3056bb204b5d 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -4417,11 +4417,9 @@ void CPUMatrix::AveragePoolingBackward(const CPUMatrix& mpRowCol, template void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, double expAvgFactor, double blendFactor, - CPUMatrix& runMean, CPUMatrix& runInvStdDev, CPUMatrix& out, double epsilon, + CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const { - UNUSED(epsilon); - assert((GetNumRows() % scale.GetNumRows()) == 0); if (expAvgFactor != 0 || blendFactor != 1) @@ -4431,6 +4429,7 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s saveInvStdDev.Resize(0, 0); bool spatial = GetNumRows() != scale.GetNumRows(); + size_t batchSize = GetNumCols(); if (spatial) { size_t spatialSize = GetNumRows() / scale.GetNumRows(); @@ -4440,7 +4439,8 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s for (long irow = 0; irow < out.GetNumRows(); irow++) { size_t imap = irow / spatialSize; - out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) * runInvStdDev(imap, 0) + bias(imap, 0); + ElemType stdDev = sqrt(runVariance(imap, 0) * (batchSize - 1) / batchSize + epsilon); + out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) / stdDev + bias(imap, 0); } } } @@ -4451,7 +4451,8 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s { for (long irow = 0; irow < out.GetNumRows(); irow++) { - out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) * runInvStdDev(irow, 0) + bias(irow, 0); + ElemType stdDev = sqrt(runVariance(irow, 0) * (batchSize - 1) / batchSize + epsilon); + out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) / stdDev + bias(irow, 0); } } } diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 2654514ce73d..357d2b7e6780 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -27,8 +27,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { double logadd(double x, double y); -//To compy with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor -//convertion is need when passing data between CPUMatrix and C++ matrices +//To comply with BLAS libraries matrices are stored in ColMajor. 
However, by default C/C++/C# use RowMajor +//conversion is need when passing data between CPUMatrix and C++ matrices template class MATH_API CPUMatrix : public BaseMatrix { @@ -78,7 +78,7 @@ class MATH_API CPUMatrix : public BaseMatrix return m_numRows * m_numCols * sizeof(ElemType); } - // Returns pointer into underlying data buffer correspoinding to slice-view. This makes it different from method Buffer() + // Returns pointer into underlying data buffer corresponding to slice-view. This makes it different from method Buffer() ElemType* Data() const { return Buffer() + m_sliceViewOffset; @@ -375,7 +375,7 @@ class MATH_API CPUMatrix : public BaseMatrix void AveragePoolingBackward(const CPUMatrix& mpRowCol, const CPUMatrix& mpRowIndices, const CPUMatrix& indices, CPUMatrix& grad) const; - void BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runInvStdDev, + void BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runStdDev, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const; void BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const; diff --git a/Source/Math/CntkBatchNormalization.cuh b/Source/Math/CntkBatchNormalization.cuh index 8593d90446dd..ae1f77221dda 100644 --- a/Source/Math/CntkBatchNormalization.cuh +++ b/Source/Math/CntkBatchNormalization.cuh @@ -30,7 +30,7 @@ cudaError_t GetLastCudaError() assert(cudaSuccess == prelaunchErr); if (prelaunchErr != cudaSuccess) return prelaunchErr; - + #ifndef NO_SYNC cudaError_t executionErr = cudaStreamSynchronize(GetStream()); assert(cudaSuccess == executionErr); @@ -149,26 +149,48 @@ void Call(size_t vectorSize, Targs... args) } //-------------------------------------------------------------------- -// Mean and variance computaion +// Mean and variance computation //-------------------------------------------------------------------- -// The kernel implements online, parallel and numerically stable algorithm -// for computing batch mean and variance (here inverse standard deviation) with one pass over the data. +// The kernel implements online, parallel and numerically stable algorithm +// for computing batch mean and variance (and inverse standard deviation) with one pass over the data. // It uses algorithms by Knuth/Welford and Chan et al (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf) // In short, algorithm has 2 steps: -// 1. Each thread strides over the input and computes mean and -// m2 value (used to compute variance at the end) - Welford algorithm. -// 2. Parallel reduction (Chan algorithm) performed by columns (note that +// 1. Each thread strides over the input and computes mean and +// m2 value (used to compute variance and inverse standard deviation at the end) - Welford algorithm. +// 2. Parallel reduction (Chan algorithm) performed by columns (note that // thread block and grid X dimensions go along the vector and Y dimension - along the batch). // As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end. -// +// +// Running mean and variance will be averaged according to an exponential +// averaging factor (expAvgFactor), taking the running statistics with weight +// (1 - expAvgFactor). 
+// Batch mean and inverse standard deviation will be further averaged according +// to a blending factor (blendFactor), taking the running statistics with +// weight blendFactor. +// If (expAvgFactor = 0) && (blendFactor = 1), there is no need to call this +// function, since there no update based on batch data is involved (inference +// mode). +// +// Averaging into running variables (runMean, runVariance): +// expAvgFactor == 0 - use running mean/var instead of the actual batch mean/var. +// 0 < expAvgFactor < 1 - average running mean/var with actual batch mean/var, e.g., +// new runMean = expAvgFactor * actual batch mean + (1 - expAvgFactor) * runMean +// expAvgFactor == 1 - use actual batch mean/var +// +// Blending into batch variables (based on new running statistics computed above): +// blendFactor == 1 - use (new) running mean/var instead of the current actual batch mean/var. +// 0 < blendFactor < 1 - blend new running mean/var with averaged mean/var of the current minibatch, e.g., +// new xMean = (1 - blendFactor) * actual batch mean + blendFactor * new runMean +// blendFactor == 0 - use actual batch mean/var template __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, const ElemType* x, // (in) input data - double expAvgFactor, - ElemType* runMean, ElemType* runInvStdDev, // (in/out) running mean/stddev, gets updated with current minibatch + double expAvgFactor, // TODO why not ElemType? same for the other parameters, functions? + double blendFactor, + ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch double epsilon, - ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean + ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean and inverse stddev { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -179,19 +201,22 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, assert(gridDim.y == 1); assert(gridDim.z == 1); assert(::isfinite(epsilon) && epsilon > 0); - assert(::isfinite(expAvgFactor) && expAvgFactor >= 0); + assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1); + assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1); + assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update) int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U; if (irowSrcBase >= vectorSize) return; assert(irowSrcBase + U <= vectorSize); - // --- estimate this minibatch's mean/stddev + // --- estimate this minibatch's mean/variance // first estimate mean over all data for this thread int n = 0; ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation) - ElemType m2[U]; // likewise for stdev + ElemType m2[U]; // likewise for variance + ElemType im2[U]; // and inverse stddev #pragma unroll for (int k = 0; k < U; k++) { @@ -220,7 +245,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, psrc += vectorSize * BlockDimY; } - // now reduce minibatch mean/stddev across threads + // now reduce minibatch mean/variance across threads const int tid = threadIdx.y * BlockDimX + threadIdx.x; const int laneId = tid & 0x1f; // First, reduce within warp using shuffle. 
@@ -245,7 +270,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, } } - // Storage for each warp in a thread block. First warp ("accumulator") holds + // Storage for each warp in a thread block. First warp ("accumulator") holds // final results so it does not need shared memory. const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS; __shared__ ElemType meanRes[BlockDimX * U][cwarp - 1]; @@ -267,7 +292,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, } __syncthreads(); - // --- final reduction and update of running mean/stddev + // --- final reduction and update of running mean/variance // Accumulate and write final results. // REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison. @@ -290,46 +315,44 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, } n = nsum; } - size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U; - // Store mean and running mean. - StoreValues(mean, xMean + idxDstBase); - // at this point, minibatch mean has been saved into xMean[] - // accumulate running mean - if (expAvgFactor == 1) // 100% comes from current minibatch, nothing from history - StoreValues(mean, runMean + idxDstBase); - else - { - ElemType run[U]; - LoadValues(runMean + idxDstBase, run); -#pragma unroll - for (int k = 0; k < U; k++) - run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k]; - StoreValues(run, runMean + idxDstBase); - } - // at this point, runMean[] has been updated + size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U; + ElemType run[U]; + ElemType x[U]; - // Store inv std dev and its running version. + // Compute running mean and batch mean. + LoadValues(runMean + idxDstBase, run); #pragma unroll for (int k = 0; k < U; k++) { - m2[k] = Operations::RSqrt(static_cast(m2[k] / batchSize + epsilon)); + run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k]; + x[k] = blendFactor * run[k] + (1.0 - blendFactor) * mean[k]; } - StoreValues(m2, xInvStdDev + idxDstBase); - // at this point, minibatch stddev has been saved into xInvStdDev[] + StoreValues(run, runMean + idxDstBase); + StoreValues(x, xMean + idxDstBase); + // At this point, runMean[] and xMean[] have been updated - if (expAvgFactor == 1) - StoreValues(m2, runInvStdDev + idxDstBase); - else - { - ElemType run[U]; - LoadValues(runInvStdDev + idxDstBase, run); + // Compute running variance and batch inverse standard deviation + LoadValues(runVariance + idxDstBase, run); + // TODO add back special cases #pragma unroll - for (int k = 0; k < U; k++) - run[k] = expAvgFactor * m2[k] + (1.0 - expAvgFactor) * run[k]; - StoreValues(run, runInvStdDev + idxDstBase); + for (int k = 0; k < U; k++) + { + // Compute batch inverse standard deviation and variance + ElemType runVariance = m2[k] / (batchSize - 1); + // Average + run[k] = expAvgFactor * runVariance + (1.0 - expAvgFactor) * run[k]; + // Blend + im2[k] = Operations::RSqrt(static_cast(m2[k] / batchSize + epsilon)); + if (blendFactor != 0) + { + ElemType runInvStdDev = Operations::RSqrt(static_cast(run[k] * (batchSize - 1) / batchSize + epsilon)); + im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k]; + } } - // at this point, runInvStdDev[] has been updated + StoreValues(run, runVariance + idxDstBase); + StoreValues(im2, xInvStdDev + idxDstBase); + // at this point, runVariance[] xInvStdDev[] have been updated } } @@ -337,9 +360,10 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, // but 
also W and H dimensions. // REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much? template -__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x, - double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev, - double epsilon, ElemType* xMean, ElemType* xInvStdDev) +__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x, + double expAvgFactor, double blendFactor, + ElemType* runMean, ElemType* runVariance, + double epsilon, ElemType* xMean, ElemType* xInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -350,7 +374,9 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial assert(gridDim.z == 1); assert((spatialSize % U) == 0); assert((vectorSize % spatialSize) == 0); - assert(::isfinite(expAvgFactor) && expAvgFactor > 0); + assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1); + assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1); + assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update) assert(::isfinite(epsilon) && epsilon > 0); int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U; @@ -419,7 +445,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial } } - // Storage for each warp in a thread block. First warp ("accumulator") holds + // Storage for each warp in a thread block. First warp ("accumulator") holds // final results so it does not need shared memory. const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS; __shared__ ElemType meanRes[U][cwarp - 1]; @@ -471,11 +497,18 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial m2[0] += m2[k] + d * k * n * dScaled; } - xMean[blockIdx.x] = mean[0]; - runMean[blockIdx.x] = (expAvgFactor == 1) ? mean[0] : (expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x]); - m2[0] = Operations::RSqrt(static_cast(m2[0] / (batchSize * spatialSize) + epsilon)); - xInvStdDev[blockIdx.x] = m2[0]; - runInvStdDev[blockIdx.x] = (expAvgFactor == 1) ? 
m2[0] : (expAvgFactor * m2[0] + (1.0 - expAvgFactor) * runInvStdDev[blockIdx.x]); + // TODO add back special cases + runMean[blockIdx.x] = expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x]; + xMean[blockIdx.x] = blendFactor * runMean[blockIdx.x] + (1.0 - blendFactor) * mean[0]; + + ElemType runV = m2[0] / (batchSize * spatialSize - 1); + runVariance[blockIdx.x] = expAvgFactor * runV + (1.0 - expAvgFactor) * runVariance[blockIdx.x]; + xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast(m2[0] / (batchSize * spatialSize) + epsilon)); + if (blendFactor != 0) + { + ElemType runInvStdDev = Operations::RSqrt(static_cast(runVariance[blockIdx.x] * (batchSize - 1) / batchSize + epsilon)); + xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x]; + } } } @@ -488,7 +521,8 @@ struct ComputeBatchMeanAndInvStdDev static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, // (in) input data double expAvgFactor, - ElemType* runMean, ElemType* runInvStdDev, // (in/out) running mean/stddev, gets updated with current minibatch + double blendFactor, + ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch double epsilon, ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop. cudaStream_t stream) @@ -501,8 +535,8 @@ struct ComputeBatchMeanAndInvStdDev // Create grid with only one block in y(batch)-dimension as kernel uses striding. auto gdim = dim3(static_cast(RoundUpToMultiple(vectorSize, BlockDimX * U))); kComputeBatchMeanAndInvStdDev<<>>( - static_cast(vectorSize), static_cast(batchSize), - x, expAvgFactor, runMean, runInvStdDev, epsilon, xMean, xInvStdDev); + static_cast(vectorSize), static_cast(batchSize), + x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev); } }; @@ -510,8 +544,8 @@ template struct ComputeSpatialBatchMeanAndInvStdDev { template - static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, - double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev, + static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, + double expAvgFactor, double blendFactor, ElemType* runMean, ElemType* runVariance, double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream) { assert((vectorSize % spatialSize) == 0); @@ -524,8 +558,8 @@ struct ComputeSpatialBatchMeanAndInvStdDev // Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions). auto gdim = dim3(static_cast(vectorSize / spatialSize)); kComputeSpatialBatchMeanAndInvStdDev<<>>( - static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), - x, expAvgFactor, runMean, runInvStdDev,epsilon, xMean, xInvStdDev); + static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), + x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev); } }; @@ -537,9 +571,13 @@ struct ComputeSpatialBatchMeanAndInvStdDev // or Cx1x1 in convolutional case. 
//-------------------------------------------------------------------- -template -__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize, const ElemType* x, ElemType* y, - const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev) +template +__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize, + double epsilon, + const ElemType* x, ElemType* y, + const ElemType* bnScale, const ElemType* bnBias, + const ElemType* runningMean, const ElemType* runningVariance, + const ElemType* batchMean, ElemType* batchInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -562,6 +600,7 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat __shared__ ElemType scaleS[BlockDimX * U]; __shared__ ElemType biasS[BlockDimX * U]; int offs = threadIdx.x * U; + // REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?). if (threadIdx.y == 0) { @@ -571,16 +610,24 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat for (int k = 0; k < U; k++) { int imap = (irowBase + k) / spatialSize; - meanS[offs + k] = batchMean[imap]; - invStdDevS[offs + k] = batchInvStdDev[imap]; + meanS[offs + k] = NormalizeRunningStats ? runningMean[imap] : batchMean[imap]; + invStdDevS[offs + k] = NormalizeRunningStats + ? Operations::RSqrt(static_cast(runningVariance[imap] * (batchSize - 1) / batchSize + epsilon)) + : batchInvStdDev[imap]; scaleS[offs + k] = bnScale[imap]; biasS[offs + k] = bnBias[imap]; } } else { - LoadValues(batchMean + irowBase, meanS + offs); - LoadValues(batchInvStdDev + irowBase, invStdDevS + offs); + LoadValues((NormalizeRunningStats ? runningMean : batchMean) + irowBase, meanS + offs); +#pragma unroll + for (int k = 0; k < U; k++) + { + invStdDevS[offs + k] = NormalizeRunningStats + ? 
Operations::RSqrt(static_cast(runningVariance[irowBase + k] * (batchSize - 1) / batchSize + epsilon)) + : invStdDevS[offs + k] = batchInvStdDev[irowBase + k]; + } LoadValues(bnScale + irowBase, scaleS + offs); LoadValues(bnBias + irowBase, biasS + offs); } @@ -604,11 +651,13 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat { ElemType val[U]; LoadValues(psrc, val); + #pragma unroll for (int k = 0; k < U; k++) { val[k] = scale[k] * (val[k] - mean[k]) * invStdDev[k] + bias[k]; } + StoreValues(val, pdst); } } @@ -618,29 +667,55 @@ struct NormalizeBatchTraining { template static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, - const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data - const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with - const ElemType* batchMean, const ElemType* batchInvStdDev, // (in) actual mean/stddev to normalize with + bool normalizeRunningStats, double epsilon, + const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data + const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with + const ElemType* runningMean, const ElemType* runningVariance, // (in) running mean/variance + const ElemType* batchMean, ElemType* batchInvStdDev, // (in) batch mean/stddev to normalize with cudaStream_t stream) { assert((vectorSize % U) == 0); const int BlockDimX = 32 / U; const int BlockDimY = 4 * U; + auto bdim = dim3(BlockDimX, BlockDimY); // Create a grid that has uses striding in y-dimension to cover whole minibatch. auto gdim = dim3((unsigned int)RoundUpToMultiple(vectorSize, BlockDimX * U)); if (spatial) { - kNormalizeBatchTraining<<>>( - (int)vectorSize, (int)spatialSize, (int)batchSize, x, y, bnScale, bnBias, - batchMean, batchInvStdDev); + if (normalizeRunningStats) + kNormalizeBatchTraining<<>>( + (int)vectorSize, (int)spatialSize, (int)batchSize, + epsilon, + x, y, bnScale, bnBias, + runningMean, runningVariance, + batchMean, batchInvStdDev); + else + kNormalizeBatchTraining<<>>( + (int)vectorSize, (int)spatialSize, (int)batchSize, + epsilon, + x, y, bnScale, bnBias, + runningMean, runningVariance, + batchMean, batchInvStdDev); } else { - kNormalizeBatchTraining<<>>( - (int)vectorSize, (int)spatialSize, (int)batchSize, x, y, bnScale, bnBias, - batchMean, batchInvStdDev); + if (normalizeRunningStats) + kNormalizeBatchTraining<<>>( + (int)vectorSize, (int)spatialSize, (int)batchSize, + epsilon, + x, y, bnScale, bnBias, + runningMean, runningVariance, + batchMean, batchInvStdDev); + else + kNormalizeBatchTraining<<>>( + (int)vectorSize, (int)spatialSize, (int)batchSize, + epsilon, + x, y, bnScale, bnBias, + runningMean, runningVariance, + batchMean, batchInvStdDev); + } } }; @@ -654,7 +729,7 @@ struct NormalizeBatchTraining template __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias, - const ElemType* saveMean, const ElemType* saveInvStdDev) + const ElemType* saveMean, const ElemType* saveInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -757,7 +832,7 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con } template -__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int 
spatialSize, int batchSize, const ElemType* x, const ElemType* dy, +__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); diff --git a/Source/Math/CuDnnBatchNormalization.cu b/Source/Math/CuDnnBatchNormalization.cu index a9fc9943b18a..3dac26fa4f6d 100644 --- a/Source/Math/CuDnnBatchNormalization.cu +++ b/Source/Math/CuDnnBatchNormalization.cu @@ -42,7 +42,7 @@ protected: InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions."); } - void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev, + void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runStdDev, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override { // REVIEW alexeyk: there might be a way to do this in cuDNN. @@ -59,14 +59,14 @@ protected: saveMean.Resize(0, 0); // (these are not produced in this case) saveInvStdDev.Resize(0, 0); CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out), - m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon)); + m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runStdDev), epsilon)); } else { saveMean.Resize(runMean); saveInvStdDev.Resize(runMean); CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), - m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev), + m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runStdDev), epsilon, ptr(saveMean), ptr(saveInvStdDev))); } } @@ -77,14 +77,9 @@ protected: UNUSED(blendFactor); // BUGBUG: It should be used. m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols()); cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION; - // REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine. -#if CUDNN_MAJOR >= 5 || (CUDNN_MAJOR == 4 && CUDNN_PATCHLEVEL >= 7) + // REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine. CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad), m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev))); -#else - CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad), - m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev))); -#endif } private: diff --git a/Source/Math/CuDnnCommon.h b/Source/Math/CuDnnCommon.h index 7fde4bad2fb1..95d48aeba7d2 100644 --- a/Source/Math/CuDnnCommon.h +++ b/Source/Math/CuDnnCommon.h @@ -8,6 +8,9 @@ #include "Basics.h" #include "TensorShape.h" #include +#if CUDNN_MAJOR < 5 +#error CNTK needs CuDNN version 5.0 or higher, cf. 
https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn. +#endif #include namespace Microsoft { namespace MSR { namespace CNTK { diff --git a/Source/Math/CuDnnConvolutionEngine.cu b/Source/Math/CuDnnConvolutionEngine.cu index 707866b8047a..873a8eb25bf6 100644 --- a/Source/Math/CuDnnConvolutionEngine.cu +++ b/Source/Math/CuDnnConvolutionEngine.cu @@ -138,16 +138,10 @@ public: } // Must use CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING to get the same results as in reference engine. -#if CUDNN_MAJOR >= 5 CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool, kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, CUDNN_PROPAGATE_NAN, (int)dims.size(), dims.data(), pad.data(), stride.data())); -#else - CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool, - kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING, - (int)dims.size(), dims.data(), pad.data(), stride.data())); -#endif } ~CuDnnPool() diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 9564237f35d2..fd910a21e122 100644 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -41,7 +41,7 @@ #define UNCONST(t, c, uc) GPUMatrix& uc = const_cast&>(c); #ifdef _WIN32 -// thread local storage to access the current stream, initalize to default stream +// thread local storage to access the current stream, initialize to default stream __declspec(thread) #endif cudaStream_t t_stream = cudaStreamDefault; @@ -62,7 +62,7 @@ cudaStream_t MATH_API GetStream() return t_stream; } -// Helper macro patterns for elemtwise methods +// Helper macro patterns for elementwise methods #define DEF_ELEMWISE_INPLACE_FUNC(f) \ template \ GPUMatrix& GPUMatrix::Inplace##f() \ @@ -3163,7 +3163,7 @@ void GPUMatrix::AveragePoolingBackward(const GPUMatrix& mpRowCol, // returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty template void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor, - GPUMatrix& runMean, GPUMatrix& runInvStdDev, GPUMatrix& out, double epsilon, + GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon, GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const { assert((GetNumRows() % scale.GetNumRows()) == 0); @@ -3172,71 +3172,54 @@ void GPUMatrix::BatchNormalizationForward(const GPUMatrix& s size_t vectorSize = GetNumRows(); size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1; size_t batchSize = GetNumCols(); + bool normalizeRunningStats; assert(0 < vectorSize && vectorSize <= std::numeric_limits::max()); assert(0 < batchSize && batchSize <= std::numeric_limits::max()); - // --- compute data mean/stddev (into saveMean/saveInvStdDev) and update running mean/stddev SyncGuard syncGuard; - // If expAvgFactor == 0 && blendFactor == 1 then we don't need to compute current minibatch statistics. if (expAvgFactor > 0 || blendFactor < 1) { + // Compute data mean and inverse standard deviation (into saveMean and + // saveInvStdDev), and update running mean and variance. 
+ normalizeRunningStats = false; saveMean.RequireSize(runMean); saveInvStdDev.RequireSize(runMean); if (spatial) { Call(spatialSize, vectorSize, spatialSize, batchSize, Data(), - expAvgFactor, runMean.Data(), runInvStdDev.Data(), epsilon, + expAvgFactor, blendFactor, + runMean.Data(), runVariance.Data(), epsilon, saveMean.Data(), saveInvStdDev.Data(), GetStream()); } else { Call(vectorSize, vectorSize, batchSize, Data(), - expAvgFactor, runMean.Data(), runInvStdDev.Data(), epsilon, + expAvgFactor, blendFactor, + runMean.Data(), runVariance.Data(), epsilon, saveMean.Data(), saveInvStdDev.Data(), GetStream()); } } - else // not computing new statistics + else { + // With expAvgFactor == 0 and blendFactor == 1 the running statistics + // do not need to be updated. CNTK engine in this case returns saveMean + // and saveInvStdDev empty, but cuDNN engine does not. + normalizeRunningStats = true; saveMean.RequireSize(0, 0); saveInvStdDev.RequireSize(0, 0); } - // --- apply MAP estimates of mean/stddev (interpolation of data and running mean/stddev) to data - // When: - // blendFactor == 1 - use running mean/var instead of the current minibatch mean/var. Note: saveMean/saveInvStdDev are NOT produced. - // 0 < blendFactor < 1 - blend running mean/var with mean/var of the current minibatch: saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean - // blendFactor == 0 - use mean/var of the current minibatch. - if (blendFactor < 1) - { - // non-zero blendFactor: interpolate minibatch mean/stddev in-place with running mean/stddev - if (blendFactor > 0) - { - // REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth. - // TODO: add a 'beta' parameter to ScaleAndAdd() - Scale((ElemType)(1 - blendFactor), saveMean); - ScaleAndAdd((ElemType)blendFactor, /*in*/ runMean, /*in/out*/ saveMean); - Scale((ElemType)(1 - blendFactor), saveInvStdDev); - ScaleAndAdd((ElemType)blendFactor, runInvStdDev, saveInvStdDev); - } - // normalize - Call(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, - Data(), out.Data(), // (in, out) data to be normalized -> normalized data - scale.Data(), bias.Data(), // (in) scale/bias to denormalize with - /*(in)*/saveMean.Data(), saveInvStdDev.Data(), // (in) actual mean/stddev to normalize with - GetStream()); - } - else // blendFactor == 1: use running mean/stddev only - { - Call(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, - Data(), out.Data(), - scale.Data(), bias.Data(), - runMean.Data(), runInvStdDev.Data(), GetStream()); - // CNTK engine returns saveMean and saveInvStdDev empty, but cnDNN engine does not. - } + Call(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, + normalizeRunningStats, epsilon, + Data(), out.Data(), + scale.Data(), bias.Data(), + runMean.Data(), runVariance.Data(), + saveMean.Data(), saveInvStdDev.Data(), + GetStream()); } -// saveMean/saveInvStdDev are the interpolated mean/stddev as used in ForwardProp(). +// saveMean/saveInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp(). // For blendFactor=1, they are not used and can be uninitialized or empty. 
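// Illustrative scalar form of what ForwardProp() stores for a statistic j computed over N samples
// (batchM2 denotes the Welford M2 accumulator and is named here only for exposition):
//   saveMean[j]      = blendFactor * runMean[j] + (1 - blendFactor) * batchMean[j]
//   saveInvStdDev[j] = blendFactor / sqrt(runVariance[j] * (N - 1) / N + epsilon) + (1 - blendFactor) / sqrt(batchM2[j] / N + epsilon)
// so the backward pass reuses the same interpolated statistics that the minibatch was normalized with.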
template void GPUMatrix::BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor, diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index b39185d6b6cd..8930af58687b 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -468,7 +468,7 @@ class MATH_API GPUMatrix : public BaseMatrix void AveragePoolingBackward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const; void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor, - GPUMatrix& runMean, GPUMatrix& runInvStdDev, GPUMatrix& out, double epsilon, + GPUMatrix& runMean, GPUMatrix& runStdDev, GPUMatrix& out, double epsilon, GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const; void BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor, const GPUMatrix& saveMean, const GPUMatrix& saveInvStdDev, diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj index ae98231191c9..401704daa6a0 100644 --- a/Source/Math/MathCUDA.vcxproj +++ b/Source/Math/MathCUDA.vcxproj @@ -30,7 +30,7 @@ $(CUDNN_PATH)\include $(CUDNN_PATH)\lib\x64 cudnn.lib - $(CUDNN_PATH)\bin\cudnn64_4.dll + $(CUDNN_PATH)\bin\cudnn64_5.dll diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 7a2ebecd8c39..f0749765f993 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -4275,7 +4275,7 @@ void Matrix::AveragePoolingBackward(const Matrix& mpRowCol, const template void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, double expAvgFactor, double blendFactor, - Matrix& runMean, Matrix& runInvStdDev, Matrix& out, double epsilon, + Matrix& runMean, Matrix& runStdDev, Matrix& out, double epsilon, Matrix& saveMean, Matrix& saveInvStdDev) const { DecideAndMoveToRightDevice(*this, out); @@ -4284,10 +4284,10 @@ void Matrix::BatchNormalizationForward(const Matrix& scale, DISPATCH_MATRIX_ON_FLAG(this, this, m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor, - *(runMean.m_CPUMatrix), *(runInvStdDev.m_CPUMatrix), + *(runMean.m_CPUMatrix), *(runStdDev.m_CPUMatrix), *(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)), m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor, - *(runMean.m_GPUMatrix), *(runInvStdDev.m_GPUMatrix), + *(runMean.m_GPUMatrix), *(runStdDev.m_GPUMatrix), *(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)), NOT_IMPLEMENTED, NOT_IMPLEMENTED); diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index d1d7a9671e9b..150ef865e7d3 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -501,7 +501,7 @@ class MATH_API Matrix : public MatrixBase void AveragePoolingBackward(const Matrix& mpRowCol, const Matrix& mpRowIndices, const Matrix& indices, Matrix& grad) const; void BatchNormalizationForward(const Matrix& scale, const Matrix& bias, double expAvgFactor, double blendFactor, - Matrix& runMean, Matrix& runInvStdDev, Matrix& out, double epsilon, + Matrix& runMean, Matrix& runStdDev, Matrix& out, double epsilon, Matrix& saveMean, Matrix& saveInvStdDev) const; void BatchNormalizationBackward(const Matrix& in, Matrix& grad, const Matrix& scale, double blendFactor, const Matrix& saveMean, const Matrix& saveInvStdDev, Matrix& scaleGrad, Matrix& biasGrad) const; diff --git a/Source/Math/NoGPU.cpp 
b/Source/Math/NoGPU.cpp index c1604583597b..55596854c78d 100644 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -1827,7 +1827,7 @@ void GPUMatrix::AveragePoolingBackward(const GPUMatrix& mpRowCol, template void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor, - GPUMatrix& runMean, GPUMatrix& runInvStdDev, GPUMatrix& out, double epsilon, + GPUMatrix& runMean, GPUMatrix& runStdDev, GPUMatrix& out, double epsilon, GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const { } diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 58b233379dde..5367c17f3717 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -352,6 +352,7 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, "or an explicit learning rate must be specified in config for the starting epoch."); } + // TODO this assumes training is picked up with nodes with zero parameters double prevDropoutRate = 0; double prevNormalizationTimeConstant = 0; double prevNormalizationBlendTimeConstant = 0; diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk new file mode 100644 index 000000000000..c3df1a844b8b --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk @@ -0,0 +1,81 @@ +rootDir = ".." + +configDir = "$rootDir$/Config" +dataDir = "$rootDir$/Data" +outputDir = "$rootDir$/Output" +modelDir = "$outputDir$/Models" + +deviceId = 0 + +command = train:test + +precision = "float" +modelPath = "$modelDir$/01_OneHidden" + +numMBsToShowResult = 500 +traceLevel = 1 + +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" + +####################################### +# TRAINING CONFIG # +####################################### + +train = [ + action = "train" + + NDLNetworkBuilder = [ + initOnCPUOnly = true + networkDescription = "$ConfigDir$/01_OneHidden.ndl" + ] + + SGD = [ + epochSize = 60000 + minibatchSize = 32 + learningRatesPerSample = 0.003125 + momentumAsTimeConstant = 0 + maxEpochs = 3 + ] + + reader = [ + readerType = "CNTKTextFormatReader" + # See ../REAMDE.md for details on getting the data (Train-28x28_cntk_text.txt). 
+ file = "$DataDir$/Train-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +####################################### +# TEST CONFIG # +####################################### + +test = [ + action = "test" + minibatchSize = 1024 # reduce this if you run out of memory + + evalNodeNames = ce:errs:top5Errs + + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Test-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl new file mode 100644 index 000000000000..6a51e0aca92e --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl @@ -0,0 +1,39 @@ +run = DNN + +DNN = [ + featDim = 784 + labelDim = 10 + hiddenDim = 200 + + features = InputValue(featDim) + featScale = Constant(0.00390625) + featScaled = Scale(featScale, features) + labels = InputValue(labelDim) + + DNNLayer(inDim, outDim, x, parmScale) = [ + W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true) + b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true) + t = Times(W, x) + z = Plus(t, b) + ] + + h1 = DNNLayer(featDim, hiddenDim, featScaled, 1) + + b = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0) + sc = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 1) + m = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + y = BatchNormalization(h1, sc, b, m, var, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$) + + ol = DNNLayer(hiddenDim, labelDim, y, 1) + + ce = CrossEntropyWithSoftmax(labels, ol) + errs = ErrorPrediction(labels, ol) + top5Errs = ErrorPrediction(labels, ol, Const(5), tag="eval") # only used in testing + + FeatureNodes = (features) + LabelNodes = (labels) + CriterionNodes = (ce) + EvalNodes = (errs) + OutputNodes = (ol) +] diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/baseline.linux.txt b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/baseline.linux.txt new file mode 100644 index 000000000000..1333ed77b7e1 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/baseline.linux.txt @@ -0,0 +1 @@ +TODO diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/baseline.windows.txt b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/baseline.windows.txt new file mode 100644 index 000000000000..0339f0794bd7 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/baseline.windows.txt @@ -0,0 +1,480 @@ +CPU info: + CPU Model Name: Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz + Hardware threads: 8 + Total Memory: 33417320 kB +------------------------------------------------------------------- +=== Running /cygdrive/c/Users/mahilleb/Repos/CNTK/x64/release/cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial/01_OneHidden.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData 
RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu DeviceId=0 timestamping=true batchNormalizationEngine=cudnn +------------------------------------------------------------------- +Build info: + + Built time: Aug 22 2016 17:36:51 + Last modified date: Fri Aug 19 10:26:01 2016 + Build type: Release + Build target: GPU + With 1bit-SGD: yes + Math lib: mkl + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 + CUB_PATH: C:\R\cub-1.4.1 + CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda + Build Branch: mahilleb/CuDnn5Test + Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) + Built by mahilleb on mahilleb42 + Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +------------------------------------------------------------------- +Changed current directory to C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +08/22/2016 16:46:32: ------------------------------------------------------------------- +08/22/2016 16:46:32: Build info: + +08/22/2016 16:46:32: Built time: Aug 22 2016 17:36:51 +08/22/2016 16:46:32: Last modified date: Fri Aug 19 10:26:01 2016 +08/22/2016 16:46:32: Build type: Release +08/22/2016 16:46:32: Build target: GPU +08/22/2016 16:46:32: With 1bit-SGD: yes +08/22/2016 16:46:32: Math lib: mkl +08/22/2016 16:46:32: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 +08/22/2016 16:46:32: CUB_PATH: C:\R\cub-1.4.1 +08/22/2016 16:46:32: CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda +08/22/2016 16:46:32: Build Branch: mahilleb/CuDnn5Test +08/22/2016 16:46:32: Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) +08/22/2016 16:46:32: Built by mahilleb on mahilleb42 +08/22/2016 16:46:32: Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +08/22/2016 16:46:32: ------------------------------------------------------------------- +08/22/2016 16:46:32: ------------------------------------------------------------------- +08/22/2016 16:46:32: GPU info: + +08/22/2016 16:46:32: Device[0]: cores = 960; computeCapability = 5.0; type = "Quadro M2000M"; memory = 4096 MB +08/22/2016 16:46:32: ------------------------------------------------------------------- + +08/22/2016 16:46:32: Running on mahilleb42 at 2016/08/22 16:46:32 +08/22/2016 16:46:32: Command line: +C:\Users\mahilleb\Repos\CNTK\x64\release\cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial/01_OneHidden.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu DeviceId=0 timestamping=true batchNormalizationEngine=cudnn + + + +08/22/2016 16:46:32: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +08/22/2016 
16:46:32: rootDir = ".." +configDir = "$rootDir$/Config" +dataDir = "$rootDir$/Data" +outputDir = "$rootDir$/Output" +modelDir = "$outputDir$/Models" +deviceId = 0 +command = train:test +precision = "float" +modelPath = "$modelDir$/01_OneHidden" +numMBsToShowResult = 500 +traceLevel = 1 +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" +train = [ + action = "train" + NDLNetworkBuilder = [ + initOnCPUOnly = true + networkDescription = "$ConfigDir$/01_OneHidden.ndl" + ] + SGD = [ + epochSize = 60000 + minibatchSize = 32 + learningRatesPerSample = 0.003125 + momentumAsTimeConstant = 0 + maxEpochs = 3 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Train-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +test = [ + action = "test" +minibatchSize = 1024 + evalNodeNames = ce:errs:top5Errs + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Test-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +DeviceId=0 +timestamping=true +batchNormalizationEngine=cudnn + +08/22/2016 16:46:32: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< + +08/22/2016 16:46:32: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:46:32: rootDir = ".." 
+configDir = "../Config" +dataDir = "../Data" +outputDir = "../Output" +modelDir = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models" +deviceId = 0 +command = train:test +precision = "float" +modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden" +numMBsToShowResult = 500 +traceLevel = 1 +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" +train = [ + action = "train" + NDLNetworkBuilder = [ + initOnCPUOnly = true + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial/01_OneHidden.ndl" + ] + SGD = [ + epochSize = 60000 + minibatchSize = 32 + learningRatesPerSample = 0.003125 + momentumAsTimeConstant = 0 + maxEpochs = 3 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData/Train-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +test = [ + action = "test" +minibatchSize = 1024 + evalNodeNames = ce:errs:top5Errs + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData/Test-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +DeviceId=0 +timestamping=true +batchNormalizationEngine=cudnn + +08/22/2016 16:46:32: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< + +08/22/2016 16:46:32: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +configparameters: 01_OneHidden.cntk:batchNormalizationEngine=cudnn +configparameters: 01_OneHidden.cntk:command=train:test +configparameters: 01_OneHidden.cntk:configDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial +configparameters: 01_OneHidden.cntk:currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +configparameters: 01_OneHidden.cntk:dataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +configparameters: 01_OneHidden.cntk:deviceId=0 +configparameters: 01_OneHidden.cntk:modelDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models +configparameters: 01_OneHidden.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden +configparameters: 01_OneHidden.cntk:numMBsToShowResult=500 +configparameters: 01_OneHidden.cntk:outputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +configparameters: 01_OneHidden.cntk:precision=float +configparameters: 
01_OneHidden.cntk:rootDir=.. +configparameters: 01_OneHidden.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +configparameters: 01_OneHidden.cntk:test=[ + action = "test" +minibatchSize = 1024 + evalNodeNames = ce:errs:top5Errs + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData/Test-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +configparameters: 01_OneHidden.cntk:timestamping=true +configparameters: 01_OneHidden.cntk:traceLevel=1 +configparameters: 01_OneHidden.cntk:train=[ + action = "train" + NDLNetworkBuilder = [ + initOnCPUOnly = true + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial/01_OneHidden.ndl" + ] + SGD = [ + epochSize = 60000 + minibatchSize = 32 + learningRatesPerSample = 0.003125 + momentumAsTimeConstant = 0 + maxEpochs = 3 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData/Train-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +08/22/2016 16:46:32: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +08/22/2016 16:46:32: Commands: train test +08/22/2016 16:46:32: Precision = "float" +08/22/2016 16:46:32: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden +08/22/2016 16:46:32: CNTKCommandTrainInfo: train : 3 +08/22/2016 16:46:32: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 + +08/22/2016 16:46:32: ############################################################################## +08/22/2016 16:46:32: # # +08/22/2016 16:46:32: # Action "train" # +08/22/2016 16:46:32: # # +08/22/2016 16:46:32: ############################################################################## + +08/22/2016 16:46:32: CNTKCommandTrainBegin: train +NDLBuilder Using GPU 0 + +08/22/2016 16:46:32: Creating virgin network. +Node 'featScale' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[200 x 784] <- 0.000000. +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'b' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'sc' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'm' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'var' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'ol.W' (LearnableParameter operation): Initializing Parameter[10 x 200] <- 0.000000. +Node 'ol.b' (LearnableParameter operation): Initializing Parameter[10 x 1] <- 0.000000. +Node 'unnamed32' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000. +Node 'featScale' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.003906. +Node 'featScale' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.003906. +Node 'unnamed32' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 5.000000. 
+Node 'featScale' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.003906. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[200 x 784] <- uniform(seed=1, range=0.050000*1.000000, onCPU=true). +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[200 x 1] <- uniform(seed=2, range=0.050000*1.000000, onCPU=true). +Node 'b' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'sc' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 1.000000. +Node 'm' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'var' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'ol.W' (LearnableParameter operation): Initializing Parameter[10 x 200] <- uniform(seed=3, range=0.050000*1.000000, onCPU=true). +Node 'ol.b' (LearnableParameter operation): Initializing Parameter[10 x 1] <- uniform(seed=4, range=0.050000*1.000000, onCPU=true). + +Post-processing network... + +4 roots: + ce = CrossEntropyWithSoftmax() + errs = ErrorPrediction() + ol.z = Plus() + top5Errs = ErrorPrediction() + +Validating network. 21 nodes to process in pass 1. + +Validating --> labels = InputValue() : -> [10 x *] +Validating --> ol.W = LearnableParameter() : -> [10 x 200] +Validating --> h1.W = LearnableParameter() : -> [200 x 784] +Validating --> featScale = LearnableParameter() : -> [1 x 1] +Validating --> features = InputValue() : -> [784 x *] +Validating --> featScaled = ElementTimes (featScale, features) : [1 x 1], [784 x *] -> [784 x 1 x *] +Validating --> h1.t = Times (h1.W, featScaled) : [200 x 784], [784 x 1 x *] -> [200 x 1 x *] +Validating --> h1.b = LearnableParameter() : -> [200 x 1] +Validating --> h1.z = Plus (h1.t, h1.b) : [200 x 1 x *], [200 x 1] -> [200 x 1 x *] +Validating --> sc = LearnableParameter() : -> [200 x 1] +Validating --> b = LearnableParameter() : -> [200 x 1] +Validating --> m = LearnableParameter() : -> [200 x 1] +Validating --> var = LearnableParameter() : -> [200 x 1] +Validating --> y = BatchNormalization (h1.z, sc, b, m, var) : [200 x 1 x *], [200 x 1], [200 x 1], [200 x 1], [200 x 1] -> [200 x 1 x *] +Validating --> ol.t = Times (ol.W, y) : [10 x 200], [200 x 1 x *] -> [10 x 1 x *] +Validating --> ol.b = LearnableParameter() : -> [10 x 1] +Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *] +Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1] +Validating --> errs = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1] +Validating --> unnamed32 = LearnableParameter() : -> [1 x 1] +Validating --> top5Errs = ErrorPrediction (labels, ol.z, unnamed32) : [10 x *], [10 x 1 x *], [1 x 1] -> [1] + +Validating network. 9 nodes to process in pass 2. + + +Validating network, final pass. + + +Using cuDNN batch normalization engine. + + +13 out of 21 nodes do not share the minibatch layout with the input data. + +Post-processing network complete. + +08/22/2016 16:46:33: Created model with 21 nodes on GPU 0. + +08/22/2016 16:46:33: Training criterion node(s): +08/22/2016 16:46:33: ce = CrossEntropyWithSoftmax + +08/22/2016 16:46:33: Evaluation criterion node(s): +08/22/2016 16:46:33: top5Errs = ErrorPrediction +08/22/2016 16:46:33: errs = ErrorPrediction + + +Allocating matrices for forward and/or backward propagation. + +Memory Sharing: Out of 33 matrices, 8 are shared as 4, and 25 are not shared. 
+ + { h1.W : [200 x 784] (gradient) + h1.z : [200 x 1 x *] } + { ol.W : [10 x 200] (gradient) + ol.z : [10 x 1 x *] (gradient) } + { h1.z : [200 x 1 x *] (gradient) + ol.t : [10 x 1 x *] } + { ol.t : [10 x 1 x *] (gradient) + sc : [200 x 1] (gradient) } + + +08/22/2016 16:46:33: Training 159410 parameters in 6 out of 6 parameter tensors and 12 nodes with gradient: + +08/22/2016 16:46:33: Node 'b' (LearnableParameter operation) : [200 x 1] +08/22/2016 16:46:33: Node 'h1.W' (LearnableParameter operation) : [200 x 784] +08/22/2016 16:46:33: Node 'h1.b' (LearnableParameter operation) : [200 x 1] +08/22/2016 16:46:33: Node 'ol.W' (LearnableParameter operation) : [10 x 200] +08/22/2016 16:46:33: Node 'ol.b' (LearnableParameter operation) : [10 x 1] +08/22/2016 16:46:33: Node 'sc' (LearnableParameter operation) : [200 x 1] + +08/22/2016 16:46:33: No PreCompute nodes found, or all already computed. Skipping pre-computation step. + +08/22/2016 16:46:33: Starting Epoch 1: learning rate per sample = 0.003125 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 0: frames [0..60000] (first sequence at sample 0), data subset 0 of 1 + +08/22/2016 16:46:33: Starting minibatch loop. +08/22/2016 16:46:35: Epoch[ 1 of 3]-Minibatch[ 1- 500, 26.67%]: ce = 0.46662509 * 16000; top5Errs = 1.306% * 16000; errs = 13.969% * 16000; time = 2.5768s; samplesPerSecond = 6209.3 +08/22/2016 16:46:37: Epoch[ 1 of 3]-Minibatch[ 501-1000, 53.33%]: ce = 0.39357101 * 16000; top5Errs = 0.794% * 16000; errs = 11.369% * 16000; time = 1.3959s; samplesPerSecond = 11461.8 +08/22/2016 16:46:38: Epoch[ 1 of 3]-Minibatch[1001-1500, 80.00%]: ce = 0.37906537 * 16000; top5Errs = 0.769% * 16000; errs = 11.100% * 16000; time = 1.3856s; samplesPerSecond = 11547.1 +08/22/2016 16:46:39: Finished Epoch[ 1 of 3]: [Training] ce = 0.40404635 * 60000; top5Errs = 0.920% * 60000; errs = 11.822% * 60000; totalSamplesSeen = 60000; learningRatePerSample = 0.003125; epochTime=6.40517s +08/22/2016 16:46:39: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden.1' + +08/22/2016 16:46:39: Starting Epoch 2: learning rate per sample = 0.003125 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 1: frames [60000..120000] (first sequence at sample 60000), data subset 0 of 1 + +08/22/2016 16:46:39: Starting minibatch loop. 
+08/22/2016 16:46:40: Epoch[ 2 of 3]-Minibatch[ 1- 500, 26.67%]: ce = 0.34492102 * 16000; top5Errs = 0.613% * 16000; errs = 10.225% * 16000; time = 1.3936s; samplesPerSecond = 11480.9 +08/22/2016 16:46:42: Epoch[ 2 of 3]-Minibatch[ 501-1000, 53.33%]: ce = 0.34236395 * 16000; top5Errs = 0.644% * 16000; errs = 10.144% * 16000; time = 1.3927s; samplesPerSecond = 11488.2 +08/22/2016 16:46:43: Epoch[ 2 of 3]-Minibatch[1001-1500, 80.00%]: ce = 0.36281952 * 16000; top5Errs = 0.800% * 16000; errs = 10.331% * 16000; time = 1.3694s; samplesPerSecond = 11684.2 +08/22/2016 16:46:44: Finished Epoch[ 2 of 3]: [Training] ce = 0.34606566 * 60000; top5Errs = 0.663% * 60000; errs = 10.123% * 60000; totalSamplesSeen = 120000; learningRatePerSample = 0.003125; epochTime=5.20706s +08/22/2016 16:46:44: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden.2' + +08/22/2016 16:46:44: Starting Epoch 3: learning rate per sample = 0.003125 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 2: frames [120000..180000] (first sequence at sample 120000), data subset 0 of 1 + +08/22/2016 16:46:44: Starting minibatch loop. +08/22/2016 16:46:46: Epoch[ 3 of 3]-Minibatch[ 1- 500, 26.67%]: ce = 0.33230911 * 16000; top5Errs = 0.581% * 16000; errs = 9.469% * 16000; time = 1.3521s; samplesPerSecond = 11833.7 +08/22/2016 16:46:47: Epoch[ 3 of 3]-Minibatch[ 501-1000, 53.33%]: ce = 0.32444919 * 16000; top5Errs = 0.531% * 16000; errs = 9.494% * 16000; time = 1.3505s; samplesPerSecond = 11847.9 +08/22/2016 16:46:48: Epoch[ 3 of 3]-Minibatch[1001-1500, 80.00%]: ce = 0.33893469 * 16000; top5Errs = 0.631% * 16000; errs = 9.588% * 16000; time = 1.3574s; samplesPerSecond = 11786.9 +08/22/2016 16:46:49: Finished Epoch[ 3 of 3]: [Training] ce = 0.33093525 * 60000; top5Errs = 0.582% * 60000; errs = 9.548% * 60000; totalSamplesSeen = 180000; learningRatePerSample = 0.003125; epochTime=5.12492s +08/22/2016 16:46:49: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden' +08/22/2016 16:46:50: CNTKCommandTrainEnd: train + +08/22/2016 16:46:50: Action "train" complete. + + +08/22/2016 16:46:50: ############################################################################## +08/22/2016 16:46:50: # # +08/22/2016 16:46:50: # Action "test" # +08/22/2016 16:46:50: # # +08/22/2016 16:46:50: ############################################################################## + +INFO: y: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax() + errs = ErrorPrediction() + top5Errs = ErrorPrediction() + +Validating network. 21 nodes to process in pass 1. 
+ +Validating --> labels = InputValue() : -> [10 x *1] +Validating --> ol.W = LearnableParameter() : -> [10 x 200] +Validating --> h1.W = LearnableParameter() : -> [200 x 784] +Validating --> featScale = LearnableParameter() : -> [1 x 1] +Validating --> features = InputValue() : -> [784 x *1] +Validating --> featScaled = ElementTimes (featScale, features) : [1 x 1], [784 x *1] -> [784 x 1 x *1] +Validating --> h1.t = Times (h1.W, featScaled) : [200 x 784], [784 x 1 x *1] -> [200 x 1 x *1] +Validating --> h1.b = LearnableParameter() : -> [200 x 1] +Validating --> h1.z = Plus (h1.t, h1.b) : [200 x 1 x *1], [200 x 1] -> [200 x 1 x *1] +Validating --> sc = LearnableParameter() : -> [200 x 1] +Validating --> b = LearnableParameter() : -> [200 x 1] +Validating --> m = LearnableParameter() : -> [200 x 1] +Validating --> var = LearnableParameter() : -> [200 x 1] +Validating --> y = BatchNormalization (h1.z, sc, b, m, var) : [200 x 1 x *1], [200 x 1], [200 x 1], [200 x 1], [200 x 1] -> [200 x 1 x *1] +Validating --> ol.t = Times (ol.W, y) : [10 x 200], [200 x 1 x *1] -> [10 x 1 x *1] +Validating --> ol.b = LearnableParameter() : -> [10 x 1] +Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1] +Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1] +Validating --> errs = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1] +Validating --> unnamed32 = LearnableParameter() : -> [1 x 1] +Validating --> top5Errs = ErrorPrediction (labels, ol.z, unnamed32) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1] + +Validating network. 9 nodes to process in pass 2. + + +Validating network, final pass. + + +Using cuDNN batch normalization engine. + + +13 out of 21 nodes do not share the minibatch layout with the input data. + +Post-processing network complete. + + + +Allocating matrices for forward and/or backward propagation. + +Memory Sharing: Out of 21 matrices, 0 are shared as 0, and 21 are not shared. + + +BlockRandomizer::StartEpoch: epoch 0: frames [0..10000] (first sequence at sample 0), data subset 0 of 1 +08/22/2016 16:46:50: Minibatch[1-10]: ce = 0.29474274 * 10000; errs = 7.990% * 10000; top5Errs = 0.540% * 10000 +08/22/2016 16:46:50: Final Results: Minibatch[1-10]: ce = 0.29474274 * 10000; perplexity = 1.34278087; errs = 7.990% * 10000; top5Errs = 0.540% * 10000 + +08/22/2016 16:46:50: Action "test" complete. + +08/22/2016 16:46:50: __COMPLETED__ \ No newline at end of file diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/run-test b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/run-test new file mode 100755 index 000000000000..5e8e9c1ae362 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/run-test @@ -0,0 +1,19 @@ +#!/bin/bash + +. $TEST_DIR/../run-test-common + +OriginalTestDir=../CuDNN + +(cd $TEST_DIR/$OriginalTestDir && md5sum baseline*) | (cd $TEST_DIR && md5sum --status -c -) +if [ $? != 0 ]; then + echo Error: Baselines must match original test. Copy from $OriginalTestDir. + exit 1 +fi + +cntkrun 01_OneHidden.cntk batchNormalizationEngine=cntk +ExitCode=$? 
+ +# Delete the test data if copied +[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir" + +exit $ExitCode diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/testcases.yml b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/testcases.yml new file mode 100644 index 000000000000..51c260b46db6 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CNTK/testcases.yml @@ -0,0 +1,38 @@ +dataDir: ../../../../../Examples/Image/MNIST/Data + +tags: + # CPU training for BatchNormalization not supported. + - bvt-e (build_sku=='gpu') and (device=='gpu') and (flavor=='release') + - nightly-e (build_sku=='gpu') and (device=='gpu') + +testCases: + CNTK Run must be completed: + patterns: + - __COMPLETED__ + + Must train epochs in exactly same order and parameters: + patterns: + - Starting Epoch {{integer}} + - learning rate per sample = {{float}} + - momentum = {{float}} + + Epochs must be finished with expected results: + patterns: + - Finished Epoch[{{integer}} of {{integer}}] + - ce = {{float,tolerance=.1%}} * {{integer}} + - errs = {{float,tolerance=.1%}}% * {{integer}} + - totalSamplesSeen = {{integer}} + - learningRatePerSample = {{float,tolerance=0.001%}} + + Per-minibatch training results must match: + patterns: + - Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} + - ce = {{float,tolerance=.1%}} * {{integer}} + - errs = {{float,tolerance=.1%}}% * {{integer}} + + Final test results must match: + patterns: + - "Final Results: Minibatch[{{integer}}-{{integer}}]" + - top5Errs = {{float,tolerance=.1%}}% * {{integer}} + - errs = {{float,tolerance=.1%}}% * {{integer}} + - ce = {{float,tolerance=.1%}} * {{integer}} diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/baseline.linux.txt b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/baseline.linux.txt new file mode 100644 index 000000000000..1333ed77b7e1 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/baseline.linux.txt @@ -0,0 +1 @@ +TODO diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/baseline.windows.txt b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/baseline.windows.txt new file mode 100644 index 000000000000..0339f0794bd7 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/baseline.windows.txt @@ -0,0 +1,480 @@ +CPU info: + CPU Model Name: Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz + Hardware threads: 8 + Total Memory: 33417320 kB +------------------------------------------------------------------- +=== Running /cygdrive/c/Users/mahilleb/Repos/CNTK/x64/release/cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial/01_OneHidden.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu DeviceId=0 timestamping=true batchNormalizationEngine=cudnn +------------------------------------------------------------------- +Build info: + + Built time: Aug 22 2016 17:36:51 + Last modified date: Fri Aug 19 10:26:01 2016 + Build type: Release + Build target: GPU + With 1bit-SGD: yes + Math lib: 
mkl + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 + CUB_PATH: C:\R\cub-1.4.1 + CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda + Build Branch: mahilleb/CuDnn5Test + Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) + Built by mahilleb on mahilleb42 + Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +------------------------------------------------------------------- +Changed current directory to C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +08/22/2016 16:46:32: ------------------------------------------------------------------- +08/22/2016 16:46:32: Build info: + +08/22/2016 16:46:32: Built time: Aug 22 2016 17:36:51 +08/22/2016 16:46:32: Last modified date: Fri Aug 19 10:26:01 2016 +08/22/2016 16:46:32: Build type: Release +08/22/2016 16:46:32: Build target: GPU +08/22/2016 16:46:32: With 1bit-SGD: yes +08/22/2016 16:46:32: Math lib: mkl +08/22/2016 16:46:32: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 +08/22/2016 16:46:32: CUB_PATH: C:\R\cub-1.4.1 +08/22/2016 16:46:32: CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda +08/22/2016 16:46:32: Build Branch: mahilleb/CuDnn5Test +08/22/2016 16:46:32: Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) +08/22/2016 16:46:32: Built by mahilleb on mahilleb42 +08/22/2016 16:46:32: Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +08/22/2016 16:46:32: ------------------------------------------------------------------- +08/22/2016 16:46:32: ------------------------------------------------------------------- +08/22/2016 16:46:32: GPU info: + +08/22/2016 16:46:32: Device[0]: cores = 960; computeCapability = 5.0; type = "Quadro M2000M"; memory = 4096 MB +08/22/2016 16:46:32: ------------------------------------------------------------------- + +08/22/2016 16:46:32: Running on mahilleb42 at 2016/08/22 16:46:32 +08/22/2016 16:46:32: Command line: +C:\Users\mahilleb\Repos\CNTK\x64\release\cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial/01_OneHidden.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu DeviceId=0 timestamping=true batchNormalizationEngine=cudnn + + + +08/22/2016 16:46:32: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:46:32: rootDir = ".." 
+configDir = "$rootDir$/Config" +dataDir = "$rootDir$/Data" +outputDir = "$rootDir$/Output" +modelDir = "$outputDir$/Models" +deviceId = 0 +command = train:test +precision = "float" +modelPath = "$modelDir$/01_OneHidden" +numMBsToShowResult = 500 +traceLevel = 1 +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" +train = [ + action = "train" + NDLNetworkBuilder = [ + initOnCPUOnly = true + networkDescription = "$ConfigDir$/01_OneHidden.ndl" + ] + SGD = [ + epochSize = 60000 + minibatchSize = 32 + learningRatesPerSample = 0.003125 + momentumAsTimeConstant = 0 + maxEpochs = 3 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Train-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +test = [ + action = "test" +minibatchSize = 1024 + evalNodeNames = ce:errs:top5Errs + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Test-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +DeviceId=0 +timestamping=true +batchNormalizationEngine=cudnn + +08/22/2016 16:46:32: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< + +08/22/2016 16:46:32: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:46:32: rootDir = ".." 
+configDir = "../Config" +dataDir = "../Data" +outputDir = "../Output" +modelDir = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models" +deviceId = 0 +command = train:test +precision = "float" +modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden" +numMBsToShowResult = 500 +traceLevel = 1 +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" +train = [ + action = "train" + NDLNetworkBuilder = [ + initOnCPUOnly = true + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial/01_OneHidden.ndl" + ] + SGD = [ + epochSize = 60000 + minibatchSize = 32 + learningRatesPerSample = 0.003125 + momentumAsTimeConstant = 0 + maxEpochs = 3 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData/Train-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +test = [ + action = "test" +minibatchSize = 1024 + evalNodeNames = ce:errs:top5Errs + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData/Test-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +DeviceId=0 +timestamping=true +batchNormalizationEngine=cudnn + +08/22/2016 16:46:32: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< + +08/22/2016 16:46:32: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +configparameters: 01_OneHidden.cntk:batchNormalizationEngine=cudnn +configparameters: 01_OneHidden.cntk:command=train:test +configparameters: 01_OneHidden.cntk:configDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial +configparameters: 01_OneHidden.cntk:currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +configparameters: 01_OneHidden.cntk:dataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData +configparameters: 01_OneHidden.cntk:deviceId=0 +configparameters: 01_OneHidden.cntk:modelDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models +configparameters: 01_OneHidden.cntk:modelPath=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden +configparameters: 01_OneHidden.cntk:numMBsToShowResult=500 +configparameters: 01_OneHidden.cntk:outputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +configparameters: 01_OneHidden.cntk:precision=float +configparameters: 
01_OneHidden.cntk:rootDir=.. +configparameters: 01_OneHidden.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu +configparameters: 01_OneHidden.cntk:test=[ + action = "test" +minibatchSize = 1024 + evalNodeNames = ce:errs:top5Errs + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData/Test-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +configparameters: 01_OneHidden.cntk:timestamping=true +configparameters: 01_OneHidden.cntk:traceLevel=1 +configparameters: 01_OneHidden.cntk:train=[ + action = "train" + NDLNetworkBuilder = [ + initOnCPUOnly = true + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\NonSpatial/01_OneHidden.ndl" + ] + SGD = [ + epochSize = 60000 + minibatchSize = 32 + learningRatesPerSample = 0.003125 + momentumAsTimeConstant = 0 + maxEpochs = 3 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu\TestData/Train-28x28_cntk_text.txt" + input = [ + features = [ + dim = 784 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +08/22/2016 16:46:32: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +08/22/2016 16:46:32: Commands: train test +08/22/2016 16:46:32: Precision = "float" +08/22/2016 16:46:32: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden +08/22/2016 16:46:32: CNTKCommandTrainInfo: train : 3 +08/22/2016 16:46:32: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3 + +08/22/2016 16:46:32: ############################################################################## +08/22/2016 16:46:32: # # +08/22/2016 16:46:32: # Action "train" # +08/22/2016 16:46:32: # # +08/22/2016 16:46:32: ############################################################################## + +08/22/2016 16:46:32: CNTKCommandTrainBegin: train +NDLBuilder Using GPU 0 + +08/22/2016 16:46:32: Creating virgin network. +Node 'featScale' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[200 x 784] <- 0.000000. +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'b' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'sc' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'm' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'var' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'ol.W' (LearnableParameter operation): Initializing Parameter[10 x 200] <- 0.000000. +Node 'ol.b' (LearnableParameter operation): Initializing Parameter[10 x 1] <- 0.000000. +Node 'unnamed32' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000. +Node 'featScale' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.003906. +Node 'featScale' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.003906. +Node 'unnamed32' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 5.000000. 
+Node 'featScale' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.003906. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[200 x 784] <- uniform(seed=1, range=0.050000*1.000000, onCPU=true). +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[200 x 1] <- uniform(seed=2, range=0.050000*1.000000, onCPU=true). +Node 'b' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'sc' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 1.000000. +Node 'm' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'var' (LearnableParameter operation): Initializing Parameter[200 x 1] <- 0.000000. +Node 'ol.W' (LearnableParameter operation): Initializing Parameter[10 x 200] <- uniform(seed=3, range=0.050000*1.000000, onCPU=true). +Node 'ol.b' (LearnableParameter operation): Initializing Parameter[10 x 1] <- uniform(seed=4, range=0.050000*1.000000, onCPU=true). + +Post-processing network... + +4 roots: + ce = CrossEntropyWithSoftmax() + errs = ErrorPrediction() + ol.z = Plus() + top5Errs = ErrorPrediction() + +Validating network. 21 nodes to process in pass 1. + +Validating --> labels = InputValue() : -> [10 x *] +Validating --> ol.W = LearnableParameter() : -> [10 x 200] +Validating --> h1.W = LearnableParameter() : -> [200 x 784] +Validating --> featScale = LearnableParameter() : -> [1 x 1] +Validating --> features = InputValue() : -> [784 x *] +Validating --> featScaled = ElementTimes (featScale, features) : [1 x 1], [784 x *] -> [784 x 1 x *] +Validating --> h1.t = Times (h1.W, featScaled) : [200 x 784], [784 x 1 x *] -> [200 x 1 x *] +Validating --> h1.b = LearnableParameter() : -> [200 x 1] +Validating --> h1.z = Plus (h1.t, h1.b) : [200 x 1 x *], [200 x 1] -> [200 x 1 x *] +Validating --> sc = LearnableParameter() : -> [200 x 1] +Validating --> b = LearnableParameter() : -> [200 x 1] +Validating --> m = LearnableParameter() : -> [200 x 1] +Validating --> var = LearnableParameter() : -> [200 x 1] +Validating --> y = BatchNormalization (h1.z, sc, b, m, var) : [200 x 1 x *], [200 x 1], [200 x 1], [200 x 1], [200 x 1] -> [200 x 1 x *] +Validating --> ol.t = Times (ol.W, y) : [10 x 200], [200 x 1 x *] -> [10 x 1 x *] +Validating --> ol.b = LearnableParameter() : -> [10 x 1] +Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *], [10 x 1] -> [10 x 1 x *] +Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1] +Validating --> errs = ErrorPrediction (labels, ol.z) : [10 x *], [10 x 1 x *] -> [1] +Validating --> unnamed32 = LearnableParameter() : -> [1 x 1] +Validating --> top5Errs = ErrorPrediction (labels, ol.z, unnamed32) : [10 x *], [10 x 1 x *], [1 x 1] -> [1] + +Validating network. 9 nodes to process in pass 2. + + +Validating network, final pass. + + +Using cuDNN batch normalization engine. + + +13 out of 21 nodes do not share the minibatch layout with the input data. + +Post-processing network complete. + +08/22/2016 16:46:33: Created model with 21 nodes on GPU 0. + +08/22/2016 16:46:33: Training criterion node(s): +08/22/2016 16:46:33: ce = CrossEntropyWithSoftmax + +08/22/2016 16:46:33: Evaluation criterion node(s): +08/22/2016 16:46:33: top5Errs = ErrorPrediction +08/22/2016 16:46:33: errs = ErrorPrediction + + +Allocating matrices for forward and/or backward propagation. + +Memory Sharing: Out of 33 matrices, 8 are shared as 4, and 25 are not shared. 
+ + { h1.W : [200 x 784] (gradient) + h1.z : [200 x 1 x *] } + { ol.W : [10 x 200] (gradient) + ol.z : [10 x 1 x *] (gradient) } + { h1.z : [200 x 1 x *] (gradient) + ol.t : [10 x 1 x *] } + { ol.t : [10 x 1 x *] (gradient) + sc : [200 x 1] (gradient) } + + +08/22/2016 16:46:33: Training 159410 parameters in 6 out of 6 parameter tensors and 12 nodes with gradient: + +08/22/2016 16:46:33: Node 'b' (LearnableParameter operation) : [200 x 1] +08/22/2016 16:46:33: Node 'h1.W' (LearnableParameter operation) : [200 x 784] +08/22/2016 16:46:33: Node 'h1.b' (LearnableParameter operation) : [200 x 1] +08/22/2016 16:46:33: Node 'ol.W' (LearnableParameter operation) : [10 x 200] +08/22/2016 16:46:33: Node 'ol.b' (LearnableParameter operation) : [10 x 1] +08/22/2016 16:46:33: Node 'sc' (LearnableParameter operation) : [200 x 1] + +08/22/2016 16:46:33: No PreCompute nodes found, or all already computed. Skipping pre-computation step. + +08/22/2016 16:46:33: Starting Epoch 1: learning rate per sample = 0.003125 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 0: frames [0..60000] (first sequence at sample 0), data subset 0 of 1 + +08/22/2016 16:46:33: Starting minibatch loop. +08/22/2016 16:46:35: Epoch[ 1 of 3]-Minibatch[ 1- 500, 26.67%]: ce = 0.46662509 * 16000; top5Errs = 1.306% * 16000; errs = 13.969% * 16000; time = 2.5768s; samplesPerSecond = 6209.3 +08/22/2016 16:46:37: Epoch[ 1 of 3]-Minibatch[ 501-1000, 53.33%]: ce = 0.39357101 * 16000; top5Errs = 0.794% * 16000; errs = 11.369% * 16000; time = 1.3959s; samplesPerSecond = 11461.8 +08/22/2016 16:46:38: Epoch[ 1 of 3]-Minibatch[1001-1500, 80.00%]: ce = 0.37906537 * 16000; top5Errs = 0.769% * 16000; errs = 11.100% * 16000; time = 1.3856s; samplesPerSecond = 11547.1 +08/22/2016 16:46:39: Finished Epoch[ 1 of 3]: [Training] ce = 0.40404635 * 60000; top5Errs = 0.920% * 60000; errs = 11.822% * 60000; totalSamplesSeen = 60000; learningRatePerSample = 0.003125; epochTime=6.40517s +08/22/2016 16:46:39: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden.1' + +08/22/2016 16:46:39: Starting Epoch 2: learning rate per sample = 0.003125 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 1: frames [60000..120000] (first sequence at sample 60000), data subset 0 of 1 + +08/22/2016 16:46:39: Starting minibatch loop. 
+08/22/2016 16:46:40: Epoch[ 2 of 3]-Minibatch[ 1- 500, 26.67%]: ce = 0.34492102 * 16000; top5Errs = 0.613% * 16000; errs = 10.225% * 16000; time = 1.3936s; samplesPerSecond = 11480.9 +08/22/2016 16:46:42: Epoch[ 2 of 3]-Minibatch[ 501-1000, 53.33%]: ce = 0.34236395 * 16000; top5Errs = 0.644% * 16000; errs = 10.144% * 16000; time = 1.3927s; samplesPerSecond = 11488.2 +08/22/2016 16:46:43: Epoch[ 2 of 3]-Minibatch[1001-1500, 80.00%]: ce = 0.36281952 * 16000; top5Errs = 0.800% * 16000; errs = 10.331% * 16000; time = 1.3694s; samplesPerSecond = 11684.2 +08/22/2016 16:46:44: Finished Epoch[ 2 of 3]: [Training] ce = 0.34606566 * 60000; top5Errs = 0.663% * 60000; errs = 10.123% * 60000; totalSamplesSeen = 120000; learningRatePerSample = 0.003125; epochTime=5.20706s +08/22/2016 16:46:44: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden.2' + +08/22/2016 16:46:44: Starting Epoch 3: learning rate per sample = 0.003125 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 2: frames [120000..180000] (first sequence at sample 120000), data subset 0 of 1 + +08/22/2016 16:46:44: Starting minibatch loop. +08/22/2016 16:46:46: Epoch[ 3 of 3]-Minibatch[ 1- 500, 26.67%]: ce = 0.33230911 * 16000; top5Errs = 0.581% * 16000; errs = 9.469% * 16000; time = 1.3521s; samplesPerSecond = 11833.7 +08/22/2016 16:46:47: Epoch[ 3 of 3]-Minibatch[ 501-1000, 53.33%]: ce = 0.32444919 * 16000; top5Errs = 0.531% * 16000; errs = 9.494% * 16000; time = 1.3505s; samplesPerSecond = 11847.9 +08/22/2016 16:46:48: Epoch[ 3 of 3]-Minibatch[1001-1500, 80.00%]: ce = 0.33893469 * 16000; top5Errs = 0.631% * 16000; errs = 9.588% * 16000; time = 1.3574s; samplesPerSecond = 11786.9 +08/22/2016 16:46:49: Finished Epoch[ 3 of 3]: [Training] ce = 0.33093525 * 60000; top5Errs = 0.582% * 60000; errs = 9.548% * 60000; totalSamplesSeen = 180000; learningRatePerSample = 0.003125; epochTime=5.12492s +08/22/2016 16:46:49: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\NonSpatial_CuDNN@release_gpu/Models/01_OneHidden' +08/22/2016 16:46:50: CNTKCommandTrainEnd: train + +08/22/2016 16:46:50: Action "train" complete. + + +08/22/2016 16:46:50: ############################################################################## +08/22/2016 16:46:50: # # +08/22/2016 16:46:50: # Action "test" # +08/22/2016 16:46:50: # # +08/22/2016 16:46:50: ############################################################################## + +INFO: y: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model + +Post-processing network... + +3 roots: + ce = CrossEntropyWithSoftmax() + errs = ErrorPrediction() + top5Errs = ErrorPrediction() + +Validating network. 21 nodes to process in pass 1. 
+ +Validating --> labels = InputValue() : -> [10 x *1] +Validating --> ol.W = LearnableParameter() : -> [10 x 200] +Validating --> h1.W = LearnableParameter() : -> [200 x 784] +Validating --> featScale = LearnableParameter() : -> [1 x 1] +Validating --> features = InputValue() : -> [784 x *1] +Validating --> featScaled = ElementTimes (featScale, features) : [1 x 1], [784 x *1] -> [784 x 1 x *1] +Validating --> h1.t = Times (h1.W, featScaled) : [200 x 784], [784 x 1 x *1] -> [200 x 1 x *1] +Validating --> h1.b = LearnableParameter() : -> [200 x 1] +Validating --> h1.z = Plus (h1.t, h1.b) : [200 x 1 x *1], [200 x 1] -> [200 x 1 x *1] +Validating --> sc = LearnableParameter() : -> [200 x 1] +Validating --> b = LearnableParameter() : -> [200 x 1] +Validating --> m = LearnableParameter() : -> [200 x 1] +Validating --> var = LearnableParameter() : -> [200 x 1] +Validating --> y = BatchNormalization (h1.z, sc, b, m, var) : [200 x 1 x *1], [200 x 1], [200 x 1], [200 x 1], [200 x 1] -> [200 x 1 x *1] +Validating --> ol.t = Times (ol.W, y) : [10 x 200], [200 x 1 x *1] -> [10 x 1 x *1] +Validating --> ol.b = LearnableParameter() : -> [10 x 1] +Validating --> ol.z = Plus (ol.t, ol.b) : [10 x 1 x *1], [10 x 1] -> [10 x 1 x *1] +Validating --> ce = CrossEntropyWithSoftmax (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1] +Validating --> errs = ErrorPrediction (labels, ol.z) : [10 x *1], [10 x 1 x *1] -> [1] +Validating --> unnamed32 = LearnableParameter() : -> [1 x 1] +Validating --> top5Errs = ErrorPrediction (labels, ol.z, unnamed32) : [10 x *1], [10 x 1 x *1], [1 x 1] -> [1] + +Validating network. 9 nodes to process in pass 2. + + +Validating network, final pass. + + +Using cuDNN batch normalization engine. + + +13 out of 21 nodes do not share the minibatch layout with the input data. + +Post-processing network complete. + + + +Allocating matrices for forward and/or backward propagation. + +Memory Sharing: Out of 21 matrices, 0 are shared as 0, and 21 are not shared. + + +BlockRandomizer::StartEpoch: epoch 0: frames [0..10000] (first sequence at sample 0), data subset 0 of 1 +08/22/2016 16:46:50: Minibatch[1-10]: ce = 0.29474274 * 10000; errs = 7.990% * 10000; top5Errs = 0.540% * 10000 +08/22/2016 16:46:50: Final Results: Minibatch[1-10]: ce = 0.29474274 * 10000; perplexity = 1.34278087; errs = 7.990% * 10000; top5Errs = 0.540% * 10000 + +08/22/2016 16:46:50: Action "test" complete. + +08/22/2016 16:46:50: __COMPLETED__ \ No newline at end of file diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/run-test b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/run-test new file mode 100755 index 000000000000..885c49683b2f --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/run-test @@ -0,0 +1,11 @@ +#!/bin/bash + +. $TEST_DIR/../run-test-common + +cntkrun 01_OneHidden.cntk batchNormalizationEngine=cudnn +ExitCode=$? 
+ +# Delete the test data if copied +[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir" + +exit $ExitCode diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/testcases.yml b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/testcases.yml new file mode 100644 index 000000000000..47400121d673 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/CuDNN/testcases.yml @@ -0,0 +1,39 @@ +dataDir: ../../../../../Examples/Image/MNIST/Data + +tags: + # running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations: + - bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu')) + # running unconditionally on every Nightly job in 'S' leg + - nightly-s (build_sku == 'gpu') + +testCases: + CNTK Run must be completed: + patterns: + - __COMPLETED__ + + Must train epochs in exactly same order and parameters: + patterns: + - Starting Epoch {{integer}} + - learning rate per sample = {{float}} + - momentum = {{float}} + + Epochs must be finished with expected results: + patterns: + - Finished Epoch[{{integer}} of {{integer}}] + - ce = {{float,tolerance=.1%}} * {{integer}} + - errs = {{float,tolerance=.1%}}% * {{integer}} + - totalSamplesSeen = {{integer}} + - learningRatePerSample = {{float,tolerance=0.001%}} + + Per-minibatch training results must match: + patterns: + - Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} + - ce = {{float,tolerance=.1%}} * {{integer}} + - errs = {{float,tolerance=.1%}}% * {{integer}} + + Final test results must match: + patterns: + - "Final Results: Minibatch[{{integer}}-{{integer}}]" + - top5Errs = {{float,tolerance=.1%}}% * {{integer}} + - errs = {{float,tolerance=.1%}}% * {{integer}} + - ce = {{float,tolerance=.1%}} * {{integer}} diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/run-test-common b/Tests/EndToEndTests/BatchNormalization/NonSpatial/run-test-common new file mode 100755 index 000000000000..a7abea93d792 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/run-test-common @@ -0,0 +1,26 @@ +#!/bin/bash + +. $TEST_ROOT_DIR/run-test-common + +ConfigDir=$TEST_DIR/.. + +if [[ ! -d $TEST_DATA_DIR || ! -e $TEST_DATA_DIR/Train-28x28_cntk_text.txt || ! -e $TEST_DATA_DIR/Test-28x28_cntk_text.txt ]]; then + # Cannot find test data locally. + # Try external test data directory (not part of the CNTK repository) as an alternative. + if [[ -d "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" ]]; then + if [ "$OS" == "Windows_NT" ]; then + DataSourceDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`/Image/MNIST/v0 + else + DataSourceDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY/Image/MNIST/v0 + fi + + # Copy the test data to the test run directory + DataDir=$TEST_RUN_DIR/TestData + mkdir $DataDir + cp -R $DataSourceDir/*_cntk_text.txt $DataDir || exit $? + Copied=1 + else + echo Error: cannot find data. Please see Examples/Image/MNIST/README.md for instructions to get it. + exit 1 + fi +fi diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk new file mode 100644 index 000000000000..baab959f7ab2 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk @@ -0,0 +1,79 @@ +RootDir = "." 
+ +ConfigDir = "$RootDir$" +DataDir = "$RootDir$" +OutputDir = "$RootDir$/Output" +ModelDir = "$OutputDir$/Models" + +ndlMacros = "$ConfigDir$/Macros.ndl" + +precision = "float" +deviceId = 0 +imageLayout = "cudnn" + +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" + +# If set to true, always initialize the network on CPU, making initialization consistent across CPU and GPU targets (for testing). +initOnCPUOnly=true + +command = Train:Test + +traceLevel = 1 +numMBsToShowResult = 500 + +Train = [ +    action = "train" +    modelPath = "$ModelDir$/02_BatchNormConv" + +    NDLNetworkBuilder = [ +        networkDescription = "$ConfigDir$/02_BatchNormConv.ndl" +    ] + +    SGD = [ +        epochSize = 1024 +        minibatchSize = 64 +        learningRatesPerMB = 0.03*7:0.01 +        momentumPerMB = 0 +        maxEpochs = 2 +        L2RegWeight = 0 +        dropoutRate = 0 +    ] + +    reader = [ +        readerType = "CNTKTextFormatReader" +        # See README.md for details on getting the data (Train_cntk_text.txt). +        file = "$DataDir$/Train_cntk_text.txt" +        input = [ +            features = [ +                dim = 3072 +                format = "dense" +            ] +            labels = [ +                dim = 10 +                format = "dense" +            ] +        ] +    ] +] + +Test = [ +    action = "test" +    modelPath = "$ModelDir$/02_BatchNormConv" +    # Set minibatch size for testing. +    minibatchSize = 16 + +    reader = [ +        readerType = "CNTKTextFormatReader" +        file = "$DataDir$/Test_cntk_text.txt" +        input = [ +            features = [ +                dim = 3072 +                format = "dense" +            ] +            labels = [ +                dim = 10 +                format = "dense" +            ] +        ] +    ] +] diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl new file mode 100644 index 000000000000..38e3a4e79008 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl @@ -0,0 +1,65 @@ +load=ndlMnistMacros +run=DNN + +ndlMnistMacros = [ +    ImageW = 32 +    ImageH = 32 +    ImageC = 3 +    LabelDim = 10 + +    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = $imageLayout$) +    featOffs = Const(128) +    featScaled = Minus(features, featOffs) +    labels = Input(LabelDim, tag = label) + +    conv1WScale = 0.0043 +    conv1BValue = 0 +    conv2WScale = 1.414 +    conv2BValue = 0 +    conv3WScale = 1.414 +    conv3BValue = 0 + +    scValue = 1 + +    # Batch normalization time constant (in samples), controlling the exponential averaging of the running mean and variance.
+ #bnTimeConst = 4096 + bnTimeConst = 2048 + + fc1WScale = 12 + fc1BValue = 0 + fc2WScale = 1.5 + fc2BValue = 0 +] + +DNN=[ + # conv1 + kW1 = 5 + kH1 = 5 + cMap1 = 3 + hStride1 = 1 + vStride1 = 1 + # weight[cMap1, kW1 * kH1 * ImageC] + b = LearnableParameter(cMap1, 1, init = fixedValue, value = conv1BValue) + sc = LearnableParameter(cMap1, 1, init = fixedValue, value = scValue) + m = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + + y = BatchNormalization(featScaled, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$) + conv1 = RectifiedLinear(y) + + # pool1 + pool1W = 3 + pool1H = 3 + pool1hStride = 2 + pool1vStride = 2 + pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout = $imageLayout$) + + hiddenDim = 64 + h1 = DNNImageReLULayer(15, 15, cMap1, hiddenDim, pool1, fc1WScale, fc1BValue) + ol = DNNLastLayer(hiddenDim, labelDim, h1, fc2WScale, fc2BValue) + + CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) + Err = ErrorPrediction(labels, ol, tag = Eval) + OutputNodes = ol +] + diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/baseline.linux.txt b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/baseline.linux.txt new file mode 100644 index 000000000000..1333ed77b7e1 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/baseline.linux.txt @@ -0,0 +1 @@ +TODO diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/baseline.windows.txt b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/baseline.windows.txt new file mode 100644 index 000000000000..3b68dcef004a --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/baseline.windows.txt @@ -0,0 +1,494 @@ +CPU info: + CPU Model Name: Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz + Hardware threads: 8 + Total Memory: 33417320 kB +------------------------------------------------------------------- +=== Running /cygdrive/c/Users/mahilleb/Repos/CNTK/x64/release/cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/02_BatchNormConv.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu DeviceId=0 timestamping=true batchNormalizationEngine=cudnn +------------------------------------------------------------------- +Build info: + + Built time: Aug 22 2016 17:36:51 + Last modified date: Fri Aug 19 10:26:01 2016 + Build type: Release + Build target: GPU + With 1bit-SGD: yes + Math lib: mkl + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 + CUB_PATH: C:\R\cub-1.4.1 + CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda + Build Branch: mahilleb/CuDnn5Test + Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) + Built by mahilleb on mahilleb42 + Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +------------------------------------------------------------------- +Changed current directory to 
C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +08/22/2016 16:47:01: ------------------------------------------------------------------- +08/22/2016 16:47:01: Build info: + +08/22/2016 16:47:01: Built time: Aug 22 2016 17:36:51 +08/22/2016 16:47:01: Last modified date: Fri Aug 19 10:26:01 2016 +08/22/2016 16:47:01: Build type: Release +08/22/2016 16:47:01: Build target: GPU +08/22/2016 16:47:01: With 1bit-SGD: yes +08/22/2016 16:47:01: Math lib: mkl +08/22/2016 16:47:01: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 +08/22/2016 16:47:01: CUB_PATH: C:\R\cub-1.4.1 +08/22/2016 16:47:01: CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda +08/22/2016 16:47:01: Build Branch: mahilleb/CuDnn5Test +08/22/2016 16:47:01: Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) +08/22/2016 16:47:01: Built by mahilleb on mahilleb42 +08/22/2016 16:47:01: Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +08/22/2016 16:47:01: ------------------------------------------------------------------- +08/22/2016 16:47:01: ------------------------------------------------------------------- +08/22/2016 16:47:01: GPU info: + +08/22/2016 16:47:01: Device[0]: cores = 960; computeCapability = 5.0; type = "Quadro M2000M"; memory = 4096 MB +08/22/2016 16:47:01: ------------------------------------------------------------------- + +08/22/2016 16:47:01: Running on mahilleb42 at 2016/08/22 16:47:01 +08/22/2016 16:47:01: Command line: +C:\Users\mahilleb\Repos\CNTK\x64\release\cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/02_BatchNormConv.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu DeviceId=0 timestamping=true batchNormalizationEngine=cudnn + + + +08/22/2016 16:47:01: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:47:01: RootDir = "." 
+ConfigDir = "$RootDir$" +DataDir = "$RootDir$" +OutputDir = "$RootDir$/Output" +ModelDir = "$OutputDir$/Models" +ndlMacros = "$ConfigDir$/Macros.ndl" +precision = "float" +deviceId = 0 +imageLayout = "cudnn" +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" +initOnCPUOnly=true +command = Train:Test +traceLevel = 1 +numMBsToShowResult = 500 +Train = [ + action = "train" + modelPath = "$ModelDir$/02_BatchNormConv" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/02_BatchNormConv.ndl" + ] + SGD = [ + epochSize = 1024 + minibatchSize = 64 + learningRatesPerMB = 0.03*7:0.01 + momentumPerMB = 0 + maxEpochs = 2 + L2RegWeight = 0 + dropoutRate = 0 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Train_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +Test = [ + action = "test" + modelPath = "$ModelDir$/02_BatchNormConv" + minibatchSize = 16 + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Test_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +DeviceId=0 +timestamping=true +batchNormalizationEngine=cudnn + +08/22/2016 16:47:01: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< + +08/22/2016 16:47:01: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:47:01: RootDir = "." +ConfigDir = "." +DataDir = "." 
+OutputDir = "./Output" +ModelDir = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models" +ndlMacros = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/Macros.ndl" +precision = "float" +deviceId = 0 +imageLayout = "cudnn" +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" +initOnCPUOnly=true +command = Train:Test +traceLevel = 1 +numMBsToShowResult = 500 +Train = [ + action = "train" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv" + NDLNetworkBuilder = [ + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/02_BatchNormConv.ndl" + ] + SGD = [ + epochSize = 1024 + minibatchSize = 64 + learningRatesPerMB = 0.03*7:0.01 + momentumPerMB = 0 + maxEpochs = 2 + L2RegWeight = 0 + dropoutRate = 0 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData/Train_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +Test = [ + action = "test" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv" + minibatchSize = 16 + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData/Test_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +DeviceId=0 +timestamping=true +batchNormalizationEngine=cudnn + +08/22/2016 16:47:01: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< + +08/22/2016 16:47:01: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +configparameters: 02_BatchNormConv.cntk:batchNormalizationEngine=cudnn +configparameters: 02_BatchNormConv.cntk:command=Train:Test +configparameters: 02_BatchNormConv.cntk:ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial +configparameters: 02_BatchNormConv.cntk:currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +configparameters: 02_BatchNormConv.cntk:DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +configparameters: 02_BatchNormConv.cntk:deviceId=0 +configparameters: 02_BatchNormConv.cntk:imageLayout=cudnn +configparameters: 02_BatchNormConv.cntk:initOnCPUOnly=true +configparameters: 02_BatchNormConv.cntk:ModelDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models +configparameters: 02_BatchNormConv.cntk:ndlMacros=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/Macros.ndl 
+configparameters: 02_BatchNormConv.cntk:numMBsToShowResult=500 +configparameters: 02_BatchNormConv.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +configparameters: 02_BatchNormConv.cntk:precision=float +configparameters: 02_BatchNormConv.cntk:RootDir=. +configparameters: 02_BatchNormConv.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +configparameters: 02_BatchNormConv.cntk:Test=[ + action = "test" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv" + minibatchSize = 16 + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData/Test_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +configparameters: 02_BatchNormConv.cntk:timestamping=true +configparameters: 02_BatchNormConv.cntk:traceLevel=1 +configparameters: 02_BatchNormConv.cntk:Train=[ + action = "train" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv" + NDLNetworkBuilder = [ + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/02_BatchNormConv.ndl" + ] + SGD = [ + epochSize = 1024 + minibatchSize = 64 + learningRatesPerMB = 0.03*7:0.01 + momentumPerMB = 0 + maxEpochs = 2 + L2RegWeight = 0 + dropoutRate = 0 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData/Train_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +08/22/2016 16:47:01: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +08/22/2016 16:47:01: Commands: Train Test +08/22/2016 16:47:01: Precision = "float" +08/22/2016 16:47:01: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv +08/22/2016 16:47:01: CNTKCommandTrainInfo: Train : 2 +08/22/2016 16:47:01: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 2 + +08/22/2016 16:47:01: ############################################################################## +08/22/2016 16:47:01: # # +08/22/2016 16:47:01: # Action "train" # +08/22/2016 16:47:01: # # +08/22/2016 16:47:01: ############################################################################## + +08/22/2016 16:47:01: CNTKCommandTrainBegin: Train +NDLBuilder Using GPU 0 + +08/22/2016 16:47:02: Creating virgin network. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000. +Node 'b' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'sc' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'm' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'var' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 15 x 15 x 3] <- 0.000000. +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- 0.000000. 
+Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'b' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'sc' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 1.000000. +Node 'm' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'var' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 15 x 15 x 3] <- gaussian(seed=1, range=0.007698*12.000000, onCPU=false). +Microsoft::MSR::CNTK::GPUMatrix::SetGaussianRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- gaussian(seed=2, range=0.025000*1.500000, onCPU=false). +Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000. + +Post-processing network... + +3 roots: + CE = CrossEntropyWithSoftmax() + Err = ErrorPrediction() + OutputNodes.z = Plus() + +Validating network. 22 nodes to process in pass 1. + +Validating --> labels = InputValue() : -> [10 x *] +Validating --> OutputNodes.W = LearnableParameter() : -> [10 x 64] +Validating --> h1.W = LearnableParameter() : -> [64 x 15 x 15 x 3] +Validating --> features = InputValue() : -> [32 x 32 x 3 x *] +Validating --> featOffs = LearnableParameter() : -> [1 x 1] +Validating --> featScaled = Minus (features, featOffs) : [32 x 32 x 3 x *], [1 x 1] -> [32 x 32 x 3 x *] +Validating --> sc = LearnableParameter() : -> [3 x 1] +Validating --> b = LearnableParameter() : -> [3 x 1] +Validating --> m = LearnableParameter() : -> [3 x 1] +Validating --> var = LearnableParameter() : -> [3 x 1] +Validating --> y = BatchNormalization (featScaled, sc, b, m, var) : [32 x 32 x 3 x *], [3 x 1], [3 x 1], [3 x 1], [3 x 1] -> [32 x 32 x 3 x *] +Validating --> conv1 = RectifiedLinear (y) : [32 x 32 x 3 x *] -> [32 x 32 x 3 x *] +Validating --> pool1 = MaxPooling (conv1) : [32 x 32 x 3 x *] -> [15 x 15 x 3 x *] +Validating --> h1.t = Times (h1.W, pool1) : [64 x 15 x 15 x 3], [15 x 15 x 3 x *] -> [64 x *] +Validating --> h1.b = LearnableParameter() : -> [64 x 1] +Validating --> h1.z = Plus (h1.t, h1.b) : [64 x *], [64 x 1] -> [64 x 1 x *] +Validating --> h1.y = RectifiedLinear (h1.z) : [64 x 1 x *] -> [64 x 1 x *] +Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x 1 x *] -> [10 x 1 x *] +Validating --> OutputNodes.b = LearnableParameter() : -> [10] +Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *], [10] -> [10 x 1 x *] +Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1] +Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1] + +Validating network. 11 nodes to process in pass 2. 
+ + +Validating network, final pass. + + +Using cuDNN batch normalization engine. + +pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 15 x 15 x 3, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. + + +11 out of 22 nodes do not share the minibatch layout with the input data. + +Post-processing network complete. + +08/22/2016 16:47:02: Created model with 22 nodes on GPU 0. + +08/22/2016 16:47:02: Training criterion node(s): +08/22/2016 16:47:02: CE = CrossEntropyWithSoftmax + +08/22/2016 16:47:02: Evaluation criterion node(s): +08/22/2016 16:47:02: Err = ErrorPrediction + + +Allocating matrices for forward and/or backward propagation. + +Memory Sharing: Out of 37 matrices, 18 are shared as 8, and 19 are not shared. + + { OutputNodes.t : [10 x 1 x *] + h1.z : [64 x 1 x *] (gradient) + pool1 : [15 x 15 x 3 x *] (gradient) } + { h1.W : [64 x 15 x 15 x 3] (gradient) + h1.z : [64 x 1 x *] } + { h1.b : [64 x 1] (gradient) + h1.y : [64 x 1 x *] (gradient) } + { b : [3 x 1] (gradient) + conv1 : [32 x 32 x 3 x *] (gradient) + h1.t : [64 x *] } + { OutputNodes.W : [10 x 64] (gradient) + OutputNodes.z : [10 x 1 x *] (gradient) } + { conv1 : [32 x 32 x 3 x *] + sc : [3 x 1] (gradient) } + { h1.t : [64 x *] (gradient) + h1.y : [64 x 1 x *] } + { pool1 : [15 x 15 x 3 x *] + y : [32 x 32 x 3 x *] (gradient) } + + +08/22/2016 16:47:02: Training 43920 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient: + +08/22/2016 16:47:02: Node 'OutputNodes.W' (LearnableParameter operation) : [10 x 64] +08/22/2016 16:47:02: Node 'OutputNodes.b' (LearnableParameter operation) : [10] +08/22/2016 16:47:02: Node 'b' (LearnableParameter operation) : [3 x 1] +08/22/2016 16:47:02: Node 'h1.W' (LearnableParameter operation) : [64 x 15 x 15 x 3] +08/22/2016 16:47:02: Node 'h1.b' (LearnableParameter operation) : [64 x 1] +08/22/2016 16:47:02: Node 'sc' (LearnableParameter operation) : [3 x 1] + +08/22/2016 16:47:02: No PreCompute nodes found, or all already computed. Skipping pre-computation step. + +08/22/2016 16:47:02: Starting Epoch 1: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 0: frames [0..1024] (first sequence at sample 0), data subset 0 of 1 + +08/22/2016 16:47:02: Starting minibatch loop. +08/22/2016 16:47:06: Finished Epoch[ 1 of 2]: [Training] CE = 2.32421112 * 1024; Err = 0.87109375 * 1024; totalSamplesSeen = 1024; learningRatePerSample = 0.00046874999; epochTime=4.11732s +08/22/2016 16:47:06: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv.1' + +08/22/2016 16:47:06: Starting Epoch 2: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 1: frames [1024..2048] (first sequence at sample 1024), data subset 0 of 1 + +08/22/2016 16:47:06: Starting minibatch loop. +08/22/2016 16:47:06: Finished Epoch[ 2 of 2]: [Training] CE = 2.24204946 * 1024; Err = 0.84375000 * 1024; totalSamplesSeen = 2048; learningRatePerSample = 0.00046874999; epochTime=0.053071s +08/22/2016 16:47:06: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv' +08/22/2016 16:47:06: CNTKCommandTrainEnd: Train + +08/22/2016 16:47:06: Action "train" complete. 
+ + +08/22/2016 16:47:06: ############################################################################## +08/22/2016 16:47:06: # # +08/22/2016 16:47:06: # Action "test" # +08/22/2016 16:47:06: # # +08/22/2016 16:47:06: ############################################################################## + +INFO: y: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model + +Post-processing network... + +3 roots: + CE = CrossEntropyWithSoftmax() + Err = ErrorPrediction() + OutputNodes.z = Plus() + +Validating network. 22 nodes to process in pass 1. + +Validating --> labels = InputValue() : -> [10 x *1] +Validating --> OutputNodes.W = LearnableParameter() : -> [10 x 64] +Validating --> h1.W = LearnableParameter() : -> [64 x 15 x 15 x 3] +Validating --> features = InputValue() : -> [32 x 32 x 3 x *1] +Validating --> featOffs = LearnableParameter() : -> [1 x 1] +Validating --> featScaled = Minus (features, featOffs) : [32 x 32 x 3 x *1], [1 x 1] -> [32 x 32 x 3 x *1] +Validating --> sc = LearnableParameter() : -> [3 x 1] +Validating --> b = LearnableParameter() : -> [3 x 1] +Validating --> m = LearnableParameter() : -> [3 x 1] +Validating --> var = LearnableParameter() : -> [3 x 1] +Validating --> y = BatchNormalization (featScaled, sc, b, m, var) : [32 x 32 x 3 x *1], [3 x 1], [3 x 1], [3 x 1], [3 x 1] -> [32 x 32 x 3 x *1] +Validating --> conv1 = RectifiedLinear (y) : [32 x 32 x 3 x *1] -> [32 x 32 x 3 x *1] +Validating --> pool1 = MaxPooling (conv1) : [32 x 32 x 3 x *1] -> [15 x 15 x 3 x *1] +Validating --> h1.t = Times (h1.W, pool1) : [64 x 15 x 15 x 3], [15 x 15 x 3 x *1] -> [64 x *1] +Validating --> h1.b = LearnableParameter() : -> [64 x 1] +Validating --> h1.z = Plus (h1.t, h1.b) : [64 x *1], [64 x 1] -> [64 x 1 x *1] +Validating --> h1.y = RectifiedLinear (h1.z) : [64 x 1 x *1] -> [64 x 1 x *1] +Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x 1 x *1] -> [10 x 1 x *1] +Validating --> OutputNodes.b = LearnableParameter() : -> [10] +Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *1], [10] -> [10 x 1 x *1] +Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1] +Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1] + +Validating network. 11 nodes to process in pass 2. + + +Validating network, final pass. + + +Using cuDNN batch normalization engine. + +pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 15 x 15 x 3, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. + + +11 out of 22 nodes do not share the minibatch layout with the input data. + +Post-processing network complete. + +evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. + +Memory Sharing: Out of 22 matrices, 0 are shared as 0, and 22 are not shared. + + +BlockRandomizer::StartEpoch: epoch 0: frames [0..10000] (first sequence at sample 0), data subset 0 of 1 +08/22/2016 16:47:08: Minibatch[1-500]: Err = 0.81025000 * 8000; CE = 2.19966818 * 8000 +08/22/2016 16:47:08: Minibatch[501-625]: Err = 0.82000000 * 2000; CE = 2.20878254 * 2000 +08/22/2016 16:47:08: Final Results: Minibatch[1-625]: Err = 0.81220000 * 10000; CE = 2.20149105 * 10000; perplexity = 9.03848028 + +08/22/2016 16:47:08: Action "test" complete. 
+ +08/22/2016 16:47:08: __COMPLETED__ \ No newline at end of file diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/run-test b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/run-test new file mode 100755 index 000000000000..e17f3ee002a0 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/run-test @@ -0,0 +1,19 @@ +#!/bin/bash + +. $TEST_DIR/../run-test-common + +OriginalTestDir=../CuDNN + +(cd $TEST_DIR/$OriginalTestDir && md5sum baseline*) | (cd $TEST_DIR && md5sum --status -c -) +if [ $? != 0 ]; then + echo Error: Baselines must match original test. Copy from $OriginalTestDir. + exit 1 +fi + +cntkrun 02_BatchNormConv.cntk batchNormalizationEngine=cntk +ExitCode=$? + +# Delete the test data if copied +[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir" + +exit $ExitCode diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml new file mode 100644 index 000000000000..459fd20b5194 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml @@ -0,0 +1,31 @@ +dataDir: ../../../../../Examples/Image/Miscellaneous/CIFAR-10 + +tags: + # CPU training for BatchNormalization not supported. + - bvt-e (build_sku=='gpu') and (device=='gpu') and (flavor=='release') + - nightly-e (build_sku=='gpu') and (device=='gpu') + +testCases: + CNTK Run must be completed: + patterns: + - __COMPLETED__ + + Must train epochs in exactly same order and parameters: + patterns: + - Starting Epoch {{integer}} + - learning rate per sample = {{float}} + - momentum = {{float}} + + Epochs must be finished with expected results: + patterns: + - Finished Epoch + - CE = {{float,tolerance=2.0%}} * {{integer}} + - Err = {{float,tolerance=2.0%}} * {{integer}} + - totalSamplesSeen = {{integer}} + - learningRatePerSample = {{float,tolerance=0.001%}} + + Final test results must match: + patterns: + - "Final Results: Minibatch[{{integer}}-{{integer}}]" + - Err = {{float,tolerance=2.0%}} * {{integer}} + - CE = {{float,tolerance=2.0%}} * {{integer}} diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/baseline.linux.txt b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/baseline.linux.txt new file mode 100644 index 000000000000..1333ed77b7e1 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/baseline.linux.txt @@ -0,0 +1 @@ +TODO diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/baseline.windows.txt b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/baseline.windows.txt new file mode 100644 index 000000000000..3b68dcef004a --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/baseline.windows.txt @@ -0,0 +1,494 @@ +CPU info: + CPU Model Name: Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz + Hardware threads: 8 + Total Memory: 33417320 kB +------------------------------------------------------------------- +=== Running /cygdrive/c/Users/mahilleb/Repos/CNTK/x64/release/cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/02_BatchNormConv.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial 
OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu DeviceId=0 timestamping=true batchNormalizationEngine=cudnn +------------------------------------------------------------------- +Build info: + + Built time: Aug 22 2016 17:36:51 + Last modified date: Fri Aug 19 10:26:01 2016 + Build type: Release + Build target: GPU + With 1bit-SGD: yes + Math lib: mkl + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 + CUB_PATH: C:\R\cub-1.4.1 + CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda + Build Branch: mahilleb/CuDnn5Test + Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) + Built by mahilleb on mahilleb42 + Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +------------------------------------------------------------------- +Changed current directory to C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +08/22/2016 16:47:01: ------------------------------------------------------------------- +08/22/2016 16:47:01: Build info: + +08/22/2016 16:47:01: Built time: Aug 22 2016 17:36:51 +08/22/2016 16:47:01: Last modified date: Fri Aug 19 10:26:01 2016 +08/22/2016 16:47:01: Build type: Release +08/22/2016 16:47:01: Build target: GPU +08/22/2016 16:47:01: With 1bit-SGD: yes +08/22/2016 16:47:01: Math lib: mkl +08/22/2016 16:47:01: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 +08/22/2016 16:47:01: CUB_PATH: C:\R\cub-1.4.1 +08/22/2016 16:47:01: CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda +08/22/2016 16:47:01: Build Branch: mahilleb/CuDnn5Test +08/22/2016 16:47:01: Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) +08/22/2016 16:47:01: Built by mahilleb on mahilleb42 +08/22/2016 16:47:01: Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +08/22/2016 16:47:01: ------------------------------------------------------------------- +08/22/2016 16:47:01: ------------------------------------------------------------------- +08/22/2016 16:47:01: GPU info: + +08/22/2016 16:47:01: Device[0]: cores = 960; computeCapability = 5.0; type = "Quadro M2000M"; memory = 4096 MB +08/22/2016 16:47:01: ------------------------------------------------------------------- + +08/22/2016 16:47:01: Running on mahilleb42 at 2016/08/22 16:47:01 +08/22/2016 16:47:01: Command line: +C:\Users\mahilleb\Repos\CNTK\x64\release\cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/02_BatchNormConv.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu DeviceId=0 timestamping=true batchNormalizationEngine=cudnn + + + +08/22/2016 16:47:01: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:47:01: RootDir = "." 
+ConfigDir = "$RootDir$" +DataDir = "$RootDir$" +OutputDir = "$RootDir$/Output" +ModelDir = "$OutputDir$/Models" +ndlMacros = "$ConfigDir$/Macros.ndl" +precision = "float" +deviceId = 0 +imageLayout = "cudnn" +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" +initOnCPUOnly=true +command = Train:Test +traceLevel = 1 +numMBsToShowResult = 500 +Train = [ + action = "train" + modelPath = "$ModelDir$/02_BatchNormConv" + NDLNetworkBuilder = [ + networkDescription = "$ConfigDir$/02_BatchNormConv.ndl" + ] + SGD = [ + epochSize = 1024 + minibatchSize = 64 + learningRatesPerMB = 0.03*7:0.01 + momentumPerMB = 0 + maxEpochs = 2 + L2RegWeight = 0 + dropoutRate = 0 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Train_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +Test = [ + action = "test" + modelPath = "$ModelDir$/02_BatchNormConv" + minibatchSize = 16 + reader = [ + readerType = "CNTKTextFormatReader" + file = "$DataDir$/Test_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +DeviceId=0 +timestamping=true +batchNormalizationEngine=cudnn + +08/22/2016 16:47:01: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< + +08/22/2016 16:47:01: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:47:01: RootDir = "." +ConfigDir = "." +DataDir = "." 
+OutputDir = "./Output" +ModelDir = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models" +ndlMacros = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/Macros.ndl" +precision = "float" +deviceId = 0 +imageLayout = "cudnn" +batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine" +initOnCPUOnly=true +command = Train:Test +traceLevel = 1 +numMBsToShowResult = 500 +Train = [ + action = "train" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv" + NDLNetworkBuilder = [ + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/02_BatchNormConv.ndl" + ] + SGD = [ + epochSize = 1024 + minibatchSize = 64 + learningRatesPerMB = 0.03*7:0.01 + momentumPerMB = 0 + maxEpochs = 2 + L2RegWeight = 0 + dropoutRate = 0 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData/Train_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +Test = [ + action = "test" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv" + minibatchSize = 16 + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData/Test_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +DeviceId=0 +timestamping=true +batchNormalizationEngine=cudnn + +08/22/2016 16:47:01: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< + +08/22/2016 16:47:01: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +configparameters: 02_BatchNormConv.cntk:batchNormalizationEngine=cudnn +configparameters: 02_BatchNormConv.cntk:command=Train:Test +configparameters: 02_BatchNormConv.cntk:ConfigDir=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial +configparameters: 02_BatchNormConv.cntk:currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +configparameters: 02_BatchNormConv.cntk:DataDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData +configparameters: 02_BatchNormConv.cntk:deviceId=0 +configparameters: 02_BatchNormConv.cntk:imageLayout=cudnn +configparameters: 02_BatchNormConv.cntk:initOnCPUOnly=true +configparameters: 02_BatchNormConv.cntk:ModelDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models +configparameters: 02_BatchNormConv.cntk:ndlMacros=C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/Macros.ndl 
+configparameters: 02_BatchNormConv.cntk:numMBsToShowResult=500 +configparameters: 02_BatchNormConv.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +configparameters: 02_BatchNormConv.cntk:precision=float +configparameters: 02_BatchNormConv.cntk:RootDir=. +configparameters: 02_BatchNormConv.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu +configparameters: 02_BatchNormConv.cntk:Test=[ + action = "test" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv" + minibatchSize = 16 + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData/Test_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +configparameters: 02_BatchNormConv.cntk:timestamping=true +configparameters: 02_BatchNormConv.cntk:traceLevel=1 +configparameters: 02_BatchNormConv.cntk:Train=[ + action = "train" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv" + NDLNetworkBuilder = [ + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Tests\EndToEndTests\BatchNormalization\Spatial/02_BatchNormConv.ndl" + ] + SGD = [ + epochSize = 1024 + minibatchSize = 64 + learningRatesPerMB = 0.03*7:0.01 + momentumPerMB = 0 + maxEpochs = 2 + L2RegWeight = 0 + dropoutRate = 0 + ] + reader = [ + readerType = "CNTKTextFormatReader" + file = "C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu\TestData/Train_cntk_text.txt" + input = [ + features = [ + dim = 3072 + format = "dense" + ] + labels = [ + dim = 10 + format = "dense" + ] + ] + ] +] + +08/22/2016 16:47:01: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +08/22/2016 16:47:01: Commands: Train Test +08/22/2016 16:47:01: Precision = "float" +08/22/2016 16:47:01: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv +08/22/2016 16:47:01: CNTKCommandTrainInfo: Train : 2 +08/22/2016 16:47:01: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 2 + +08/22/2016 16:47:01: ############################################################################## +08/22/2016 16:47:01: # # +08/22/2016 16:47:01: # Action "train" # +08/22/2016 16:47:01: # # +08/22/2016 16:47:01: ############################################################################## + +08/22/2016 16:47:01: CNTKCommandTrainBegin: Train +NDLBuilder Using GPU 0 + +08/22/2016 16:47:02: Creating virgin network. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000. +Node 'b' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'sc' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'm' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'var' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 15 x 15 x 3] <- 0.000000. +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- 0.000000. 
+Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'b' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'sc' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 1.000000. +Node 'm' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'var' (LearnableParameter operation): Initializing Parameter[3 x 1] <- 0.000000. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 15 x 15 x 3] <- gaussian(seed=1, range=0.007698*12.000000, onCPU=false). +Microsoft::MSR::CNTK::GPUMatrix::SetGaussianRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- gaussian(seed=2, range=0.025000*1.500000, onCPU=false). +Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000. + +Post-processing network... + +3 roots: + CE = CrossEntropyWithSoftmax() + Err = ErrorPrediction() + OutputNodes.z = Plus() + +Validating network. 22 nodes to process in pass 1. + +Validating --> labels = InputValue() : -> [10 x *] +Validating --> OutputNodes.W = LearnableParameter() : -> [10 x 64] +Validating --> h1.W = LearnableParameter() : -> [64 x 15 x 15 x 3] +Validating --> features = InputValue() : -> [32 x 32 x 3 x *] +Validating --> featOffs = LearnableParameter() : -> [1 x 1] +Validating --> featScaled = Minus (features, featOffs) : [32 x 32 x 3 x *], [1 x 1] -> [32 x 32 x 3 x *] +Validating --> sc = LearnableParameter() : -> [3 x 1] +Validating --> b = LearnableParameter() : -> [3 x 1] +Validating --> m = LearnableParameter() : -> [3 x 1] +Validating --> var = LearnableParameter() : -> [3 x 1] +Validating --> y = BatchNormalization (featScaled, sc, b, m, var) : [32 x 32 x 3 x *], [3 x 1], [3 x 1], [3 x 1], [3 x 1] -> [32 x 32 x 3 x *] +Validating --> conv1 = RectifiedLinear (y) : [32 x 32 x 3 x *] -> [32 x 32 x 3 x *] +Validating --> pool1 = MaxPooling (conv1) : [32 x 32 x 3 x *] -> [15 x 15 x 3 x *] +Validating --> h1.t = Times (h1.W, pool1) : [64 x 15 x 15 x 3], [15 x 15 x 3 x *] -> [64 x *] +Validating --> h1.b = LearnableParameter() : -> [64 x 1] +Validating --> h1.z = Plus (h1.t, h1.b) : [64 x *], [64 x 1] -> [64 x 1 x *] +Validating --> h1.y = RectifiedLinear (h1.z) : [64 x 1 x *] -> [64 x 1 x *] +Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x 1 x *] -> [10 x 1 x *] +Validating --> OutputNodes.b = LearnableParameter() : -> [10] +Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *], [10] -> [10 x 1 x *] +Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1] +Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *], [10 x 1 x *] -> [1] + +Validating network. 11 nodes to process in pass 2. 
+ + +Validating network, final pass. + + +Using cuDNN batch normalization engine. + +pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 15 x 15 x 3, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. + + +11 out of 22 nodes do not share the minibatch layout with the input data. + +Post-processing network complete. + +08/22/2016 16:47:02: Created model with 22 nodes on GPU 0. + +08/22/2016 16:47:02: Training criterion node(s): +08/22/2016 16:47:02: CE = CrossEntropyWithSoftmax + +08/22/2016 16:47:02: Evaluation criterion node(s): +08/22/2016 16:47:02: Err = ErrorPrediction + + +Allocating matrices for forward and/or backward propagation. + +Memory Sharing: Out of 37 matrices, 18 are shared as 8, and 19 are not shared. + + { OutputNodes.t : [10 x 1 x *] + h1.z : [64 x 1 x *] (gradient) + pool1 : [15 x 15 x 3 x *] (gradient) } + { h1.W : [64 x 15 x 15 x 3] (gradient) + h1.z : [64 x 1 x *] } + { h1.b : [64 x 1] (gradient) + h1.y : [64 x 1 x *] (gradient) } + { b : [3 x 1] (gradient) + conv1 : [32 x 32 x 3 x *] (gradient) + h1.t : [64 x *] } + { OutputNodes.W : [10 x 64] (gradient) + OutputNodes.z : [10 x 1 x *] (gradient) } + { conv1 : [32 x 32 x 3 x *] + sc : [3 x 1] (gradient) } + { h1.t : [64 x *] (gradient) + h1.y : [64 x 1 x *] } + { pool1 : [15 x 15 x 3 x *] + y : [32 x 32 x 3 x *] (gradient) } + + +08/22/2016 16:47:02: Training 43920 parameters in 6 out of 6 parameter tensors and 15 nodes with gradient: + +08/22/2016 16:47:02: Node 'OutputNodes.W' (LearnableParameter operation) : [10 x 64] +08/22/2016 16:47:02: Node 'OutputNodes.b' (LearnableParameter operation) : [10] +08/22/2016 16:47:02: Node 'b' (LearnableParameter operation) : [3 x 1] +08/22/2016 16:47:02: Node 'h1.W' (LearnableParameter operation) : [64 x 15 x 15 x 3] +08/22/2016 16:47:02: Node 'h1.b' (LearnableParameter operation) : [64 x 1] +08/22/2016 16:47:02: Node 'sc' (LearnableParameter operation) : [3 x 1] + +08/22/2016 16:47:02: No PreCompute nodes found, or all already computed. Skipping pre-computation step. + +08/22/2016 16:47:02: Starting Epoch 1: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 0: frames [0..1024] (first sequence at sample 0), data subset 0 of 1 + +08/22/2016 16:47:02: Starting minibatch loop. +08/22/2016 16:47:06: Finished Epoch[ 1 of 2]: [Training] CE = 2.32421112 * 1024; Err = 0.87109375 * 1024; totalSamplesSeen = 1024; learningRatePerSample = 0.00046874999; epochTime=4.11732s +08/22/2016 16:47:06: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv.1' + +08/22/2016 16:47:06: Starting Epoch 2: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 1: frames [1024..2048] (first sequence at sample 1024), data subset 0 of 1 + +08/22/2016 16:47:06: Starting minibatch loop. +08/22/2016 16:47:06: Finished Epoch[ 2 of 2]: [Training] CE = 2.24204946 * 1024; Err = 0.84375000 * 1024; totalSamplesSeen = 2048; learningRatePerSample = 0.00046874999; epochTime=0.053071s +08/22/2016 16:47:06: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174612.125245\BatchNormalization\Spatial_CuDNN@release_gpu/Models/02_BatchNormConv' +08/22/2016 16:47:06: CNTKCommandTrainEnd: Train + +08/22/2016 16:47:06: Action "train" complete. 
+ + +08/22/2016 16:47:06: ############################################################################## +08/22/2016 16:47:06: # # +08/22/2016 16:47:06: # Action "test" # +08/22/2016 16:47:06: # # +08/22/2016 16:47:06: ############################################################################## + +INFO: y: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model + +Post-processing network... + +3 roots: + CE = CrossEntropyWithSoftmax() + Err = ErrorPrediction() + OutputNodes.z = Plus() + +Validating network. 22 nodes to process in pass 1. + +Validating --> labels = InputValue() : -> [10 x *1] +Validating --> OutputNodes.W = LearnableParameter() : -> [10 x 64] +Validating --> h1.W = LearnableParameter() : -> [64 x 15 x 15 x 3] +Validating --> features = InputValue() : -> [32 x 32 x 3 x *1] +Validating --> featOffs = LearnableParameter() : -> [1 x 1] +Validating --> featScaled = Minus (features, featOffs) : [32 x 32 x 3 x *1], [1 x 1] -> [32 x 32 x 3 x *1] +Validating --> sc = LearnableParameter() : -> [3 x 1] +Validating --> b = LearnableParameter() : -> [3 x 1] +Validating --> m = LearnableParameter() : -> [3 x 1] +Validating --> var = LearnableParameter() : -> [3 x 1] +Validating --> y = BatchNormalization (featScaled, sc, b, m, var) : [32 x 32 x 3 x *1], [3 x 1], [3 x 1], [3 x 1], [3 x 1] -> [32 x 32 x 3 x *1] +Validating --> conv1 = RectifiedLinear (y) : [32 x 32 x 3 x *1] -> [32 x 32 x 3 x *1] +Validating --> pool1 = MaxPooling (conv1) : [32 x 32 x 3 x *1] -> [15 x 15 x 3 x *1] +Validating --> h1.t = Times (h1.W, pool1) : [64 x 15 x 15 x 3], [15 x 15 x 3 x *1] -> [64 x *1] +Validating --> h1.b = LearnableParameter() : -> [64 x 1] +Validating --> h1.z = Plus (h1.t, h1.b) : [64 x *1], [64 x 1] -> [64 x 1 x *1] +Validating --> h1.y = RectifiedLinear (h1.z) : [64 x 1 x *1] -> [64 x 1 x *1] +Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x 1 x *1] -> [10 x 1 x *1] +Validating --> OutputNodes.b = LearnableParameter() : -> [10] +Validating --> OutputNodes.z = Plus (OutputNodes.t, OutputNodes.b) : [10 x 1 x *1], [10] -> [10 x 1 x *1] +Validating --> CE = CrossEntropyWithSoftmax (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1] +Validating --> Err = ErrorPrediction (labels, OutputNodes.z) : [10 x *1], [10 x 1 x *1] -> [1] + +Validating network. 11 nodes to process in pass 2. + + +Validating network, final pass. + + +Using cuDNN batch normalization engine. + +pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 15 x 15 x 3, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. + + +11 out of 22 nodes do not share the minibatch layout with the input data. + +Post-processing network complete. + +evalNodeNames are not specified, using all the default evalnodes and training criterion nodes. + + +Allocating matrices for forward and/or backward propagation. + +Memory Sharing: Out of 22 matrices, 0 are shared as 0, and 22 are not shared. + + +BlockRandomizer::StartEpoch: epoch 0: frames [0..10000] (first sequence at sample 0), data subset 0 of 1 +08/22/2016 16:47:08: Minibatch[1-500]: Err = 0.81025000 * 8000; CE = 2.19966818 * 8000 +08/22/2016 16:47:08: Minibatch[501-625]: Err = 0.82000000 * 2000; CE = 2.20878254 * 2000 +08/22/2016 16:47:08: Final Results: Minibatch[1-625]: Err = 0.81220000 * 10000; CE = 2.20149105 * 10000; perplexity = 9.03848028 + +08/22/2016 16:47:08: Action "test" complete. 
+ +08/22/2016 16:47:08: __COMPLETED__ \ No newline at end of file diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/run-test b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/run-test new file mode 100755 index 000000000000..95357ac32f8b --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/run-test @@ -0,0 +1,11 @@ +#!/bin/bash + +. $TEST_DIR/../run-test-common + +cntkrun 02_BatchNormConv.cntk batchNormalizationEngine=cudnn +ExitCode=$? + +# Delete the test data if copied +[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir" + +exit $ExitCode diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml new file mode 100644 index 000000000000..459fd20b5194 --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml @@ -0,0 +1,31 @@ +dataDir: ../../../../../Examples/Image/Miscellaneous/CIFAR-10 + +tags: + # CPU training for BatchNormalization not supported. + - bvt-e (build_sku=='gpu') and (device=='gpu') and (flavor=='release') + - nightly-e (build_sku=='gpu') and (device=='gpu') + +testCases: + CNTK Run must be completed: + patterns: + - __COMPLETED__ + + Must train epochs in exactly same order and parameters: + patterns: + - Starting Epoch {{integer}} + - learning rate per sample = {{float}} + - momentum = {{float}} + + Epochs must be finished with expected results: + patterns: + - Finished Epoch + - CE = {{float,tolerance=2.0%}} * {{integer}} + - Err = {{float,tolerance=2.0%}} * {{integer}} + - totalSamplesSeen = {{integer}} + - learningRatePerSample = {{float,tolerance=0.001%}} + + Final test results must match: + patterns: + - "Final Results: Minibatch[{{integer}}-{{integer}}]" + - Err = {{float,tolerance=2.0%}} * {{integer}} + - CE = {{float,tolerance=2.0%}} * {{integer}} diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl b/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl new file mode 100644 index 000000000000..6c783b86031c --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl @@ -0,0 +1,148 @@ +ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) +[ + W = LearnableParameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) + b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$) + c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$) + p = Plus(c, b) + y = RectifiedLinear(p) +] + +ConvLocalReLULayer(inp, outMap, outWCount, inMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) +[ + W = LearnableParameter(outWCount, inWCount, init = Gaussian, initValueScale = wScale) + b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$) + c = Convolution(W, inp, {kW, kH, inMap}, mapCount = outMap, stride = {hStride, vStride, inMap}, sharing = {false, false, false}, imageLayout = $imageLayout$) + p = Plus(c, b) + y = RectifiedLinear(p) +] + +ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) +[ + b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue) + sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue) + m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + + c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding 
= true, imageLayout = $imageLayout$) + y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn) +] + +ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) +[ + W = LearnableParameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) + c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) +] + +ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) +[ + c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) + y = RectifiedLinear(c) +] + +ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst) +[ + b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue) + sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue) + m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + + c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$) + y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn) +] + +ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst) +[ + # First convolution layer. + c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst) + # Second convolution layer, no ReLU. + c2 = ConvBNLayer(c1, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst) + p = Plus(c2, inp) + y = RectifiedLinear(p) +] + +ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst, Wproj) +[ + # First convolution layer. + c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst) + # Second convolution layer, no ReLU. + c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst) + + # Projection convolution layer. + c_proj = ProjLayer(Wproj, inp, outMap, 2, 2, bValue, scValue, bnTimeConst) + #c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = $imageLayout$) + + p = Plus(c2, c_proj) + y = RectifiedLinear(p) +] + +ResNetNode2Inc2(inp, inMap, outMap, inWCount, wCount, kW, kH, wScale, w1Scale, bValue, scValue, bnTimeConst) +[ + pool = MaxPooling(inp, 1, 1, 2, 2, imageLayout = $imageLayout$) + # First convolution layer. + c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst) + # Second convolution layer, no ReLU. 
+ c2 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, bnTimeConst) + c3 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, bnTimeConst) + + p = Plus(c2, pool) + r = RowStack(p, c3) + y = RectifiedLinear(r) +] + +DnnReLULayer(inDim, outDim, x, wScale, bValue) +[ + W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale) + b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) + t = Times(W, x) + z = Plus(t, b) + y = RectifiedLinear(z) +] + +DNNImageReLULayer(inW, inH, inC, outDim, x, wScale, bValue) +[ + W = ImageParameter(outDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$) + b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) + t = Times(W, x) + z = Plus(t, b) + y = RectifiedLinear(z) +] + +DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) +[ + W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale) + b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) + sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) + m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + t = Times(W, x) + bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn) + y = RectifiedLinear(bn) +] + +DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeConst) +[ + W = ImageParameter(outDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$) + b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) + sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) + m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + t = Times(W, x) + bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn) + y = RectifiedLinear(bn) +] + +DnnLastLayer(hiddenDim, labelDim, x, wScale, bValue) +[ + W = LearnableParameter(labelDim, hiddenDim, init = Gaussian, initValueScale = wScale) + b = LearnableParameter(labelDim, init = fixedValue, value = bValue) + t = Times(W, x) + z = Plus(t, b) +] + +DnnImageLastLayer(inW, inH, inC, labelDim, x, wScale, bValue) +[ + W = ImageParameter(labelDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$) + b = LearnableParameter(labelDim, init = fixedValue, value = bValue) + t = Times(W, x) + z = Plus(t, b) +] diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/run-test-common b/Tests/EndToEndTests/BatchNormalization/Spatial/run-test-common new file mode 100755 index 000000000000..b9a7ea16b6fb --- /dev/null +++ b/Tests/EndToEndTests/BatchNormalization/Spatial/run-test-common @@ -0,0 +1,26 @@ +#!/bin/bash + +. $TEST_ROOT_DIR/run-test-common + +ConfigDir=$TEST_DIR/.. + +if [[ ! -d $TEST_DATA_DIR || ! -e $TEST_DATA_DIR/Train_cntk_text.txt || ! -e $TEST_DATA_DIR/Test_cntk_text.txt ]]; then + # Cannot find test data locally. + # Try external test data directory (not part of the CNTK repository) as an alternative. 
+ if [[ -d "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" ]]; then + if [ "$OS" == "Windows_NT" ]; then + DataSourceDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`/Image/CIFAR/v0 + else + DataSourceDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY/Image/CIFAR/v0 + fi + + # Copy the test data to the test run directory + DataDir=$TEST_RUN_DIR/TestData + mkdir $DataDir + cp -R $DataSourceDir/*_cntk_text.txt $DataDir || exit $? + Copied=1 + else + echo Error: cannot find data. Please see Examples/Image/Miscellaneous/CIFAR-10/README.md for instructions to get it. + exit 1 + fi +fi diff --git a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.windows.txt b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.windows.txt index 4a0199cb5bc9..78db0e475a77 100644 --- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.windows.txt +++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.windows.txt @@ -1,47 +1,59 @@ -=== Running /cygdrive/c/jenkins/workspace/CNTK-Test-Windows-W1/x64/release/cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10/../../../../Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/Config/02_BatchNormConv.cntk currentDirectory=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DataDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10 OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=5]] Train=[SGD=[epochSize=100]] stderr=- +CPU info: + CPU Model Name: Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz + Hardware threads: 8 + Total Memory: 33417320 kB +------------------------------------------------------------------- +=== Running /cygdrive/c/Users/mahilleb/Repos/CNTK/x64/release/cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10/02_BatchNormConv.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10 OutputDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=5]] Train=[SGD=[epochSize=100]] stderr=- ------------------------------------------------------------------- Build info: - Built time: May 13 2016 08:06:01 - Last modified date: Thu May 12 07:31:50 2016 + Built time: Aug 22 2016 17:36:51 + Last modified date: Fri Aug 19 10:26:01 2016 Build type: Release Build target: GPU - 
With 1bit-SGD: no + With 1bit-SGD: yes + Math lib: mkl CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 - CUB_PATH: c:\src\cub-1.4.1 - CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda - Build Branch: HEAD - Build SHA1: 35fadc316f045d843bbd9b85061250a959268787 - Built by svcphil on Philly-Pool3 - Build Path: c:\Jenkins\workspace\CNTK-Build-Windows\Source\CNTK\ + CUB_PATH: C:\R\cub-1.4.1 + CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda + Build Branch: mahilleb/CuDnn5Test + Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) + Built by mahilleb on mahilleb42 + Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ ------------------------------------------------------------------- -Changed current directory to C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData -05/13/2016 08:18:23: Redirecting stderr to file -_Train_Test.log -05/13/2016 08:18:23: ------------------------------------------------------------------- -05/13/2016 08:18:23: Build info: - -05/13/2016 08:18:23: Built time: May 13 2016 08:06:01 -05/13/2016 08:18:23: Last modified date: Thu May 12 07:31:50 2016 -05/13/2016 08:18:23: Build type: Release -05/13/2016 08:18:23: Build target: GPU -05/13/2016 08:18:23: With 1bit-SGD: no -05/13/2016 08:18:23: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 -05/13/2016 08:18:23: CUB_PATH: c:\src\cub-1.4.1 -05/13/2016 08:18:23: CUDNN_PATH: c:\NVIDIA\cudnn-4.0\cuda -05/13/2016 08:18:23: Build Branch: HEAD -05/13/2016 08:18:23: Build SHA1: 35fadc316f045d843bbd9b85061250a959268787 -05/13/2016 08:18:23: Built by svcphil on Philly-Pool3 -05/13/2016 08:18:23: Build Path: c:\Jenkins\workspace\CNTK-Build-Windows\Source\CNTK\ -05/13/2016 08:18:23: ------------------------------------------------------------------- - -05/13/2016 08:18:23: Running on Philly-Pool2 at 2016/05/13 08:18:23 -05/13/2016 08:18:23: Command line: -C:\jenkins\workspace\CNTK-Test-Windows-W1\x64\release\cntk.exe configFile=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10/../../../../Tests/EndToEndTests/CNTKTextFormatReader/Examples/Image/Miscellaneous/CIFAR-10/Config/02_BatchNormConv.cntk currentDirectory=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DataDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10 OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=5]] Train=[SGD=[epochSize=100]] stderr=- - - - -05/13/2016 08:18:23: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> -05/13/2016 08:18:23: RootDir = "." 
+Changed current directory to C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData +08/22/2016 16:48:05: Redirecting stderr to file -_Train_Test.log +08/22/2016 16:48:05: ------------------------------------------------------------------- +08/22/2016 16:48:05: Build info: + +08/22/2016 16:48:05: Built time: Aug 22 2016 17:36:51 +08/22/2016 16:48:05: Last modified date: Fri Aug 19 10:26:01 2016 +08/22/2016 16:48:05: Build type: Release +08/22/2016 16:48:05: Build target: GPU +08/22/2016 16:48:05: With 1bit-SGD: yes +08/22/2016 16:48:05: Math lib: mkl +08/22/2016 16:48:05: CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5 +08/22/2016 16:48:05: CUB_PATH: C:\R\cub-1.4.1 +08/22/2016 16:48:05: CUDNN_PATH: C:\R\cudnn-7.5-windows10-x64-v5.0-ga\cuda +08/22/2016 16:48:05: Build Branch: mahilleb/CuDnn5Test +08/22/2016 16:48:05: Build SHA1: db500985aff6d7d67b90c1d0dedcbcd7f8ae7b96 (modified) +08/22/2016 16:48:05: Built by mahilleb on mahilleb42 +08/22/2016 16:48:05: Build Path: C:\Users\mahilleb\Repos\CNTK\Source\CNTK\ +08/22/2016 16:48:05: ------------------------------------------------------------------- +08/22/2016 16:48:05: ------------------------------------------------------------------- +08/22/2016 16:48:05: GPU info: + +08/22/2016 16:48:05: Device[0]: cores = 960; computeCapability = 5.0; type = "Quadro M2000M"; memory = 4096 MB +08/22/2016 16:48:05: ------------------------------------------------------------------- + +08/22/2016 16:48:05: Running on mahilleb42 at 2016/08/22 16:48:05 +08/22/2016 16:48:05: Command line: +C:\Users\mahilleb\Repos\CNTK\x64\release\cntk.exe configFile=C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10/02_BatchNormConv.cntk currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData RunDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DataDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData ConfigDir=C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10 OutputDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=5]] Train=[SGD=[epochSize=100]] stderr=- + + + +08/22/2016 16:48:05: >>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:48:05: RootDir = "." 
ConfigDir = "$RootDir$" DataDir = "$RootDir$" OutputDir = "$RootDir$/Output" @@ -51,7 +63,6 @@ precision = "float" deviceId = 0 imageLayout = "cudnn" initOnCPUOnly=true -prefetch = "true" command = Train:Test stderr = "$OutputDir$/02_BatchNormConv" traceLevel = 1 @@ -84,7 +95,7 @@ Train = [ format = "dense" ] ] - ] + ] ] Test = [ action = "test" @@ -103,42 +114,41 @@ Test = [ format = "dense" ] ] - ] + ] ] -currentDirectory=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData -RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu -DataDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData -ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10 -OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10 +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=5]] Train=[SGD=[epochSize=100]] stderr=- -05/13/2016 08:18:23: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< +08/22/2016 16:48:05: <<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<< -05/13/2016 08:18:23: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> -05/13/2016 08:18:23: RootDir = "." +08/22/2016 16:48:05: >>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:48:05: RootDir = "." ConfigDir = "." DataDir = "." 
OutputDir = "./Output" -ModelDir = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models" -ndlMacros = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10/Macros.ndl" +ModelDir = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models" +ndlMacros = "C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10/Macros.ndl" precision = "float" deviceId = 0 imageLayout = "cudnn" initOnCPUOnly=true -prefetch = "true" command = Train:Test -stderr = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/02_BatchNormConv" +stderr = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/02_BatchNormConv" traceLevel = 1 numMBsToShowResult = 500 Train = [ action = "train" - modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv" NDLNetworkBuilder = [ - networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10/02_BatchNormConv.ndl" + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10/02_BatchNormConv.ndl" ] SGD = [ epochSize = 49984 @@ -151,7 +161,7 @@ Train = [ ] reader = [ readerType = "CNTKTextFormatReader" - file = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData/Train_cntk_text.txt" + file = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData/Train_cntk_text.txt" input = [ features = [ dim = 3072 @@ -162,15 +172,15 @@ Train = [ format = "dense" ] ] - ] + ] ] Test = [ action = "test" - modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv" minibatchSize = 16 reader = [ readerType = "CNTKTextFormatReader" - file = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData/Test_cntk_text.txt" + file = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData/Test_cntk_text.txt" input = [ features = [ dim = 3072 @@ -181,45 +191,44 @@ Test = [ format = "dense" ] ] - ] + ] ] -currentDirectory=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData -RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu 
-DataDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData -ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10 -OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu +currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData +RunDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu +DataDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData +ConfigDir=C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10 +OutputDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu DeviceId=0 timestamping=true Train=[SGD=[maxEpochs=5]] Train=[SGD=[epochSize=100]] stderr=- -05/13/2016 08:18:23: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +08/22/2016 16:48:05: <<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -05/13/2016 08:18:23: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> +08/22/2016 16:48:05: >>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>> configparameters: 02_BatchNormConv.cntk:command=Train:Test -configparameters: 02_BatchNormConv.cntk:ConfigDir=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10 -configparameters: 02_BatchNormConv.cntk:currentDirectory=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData -configparameters: 02_BatchNormConv.cntk:DataDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData +configparameters: 02_BatchNormConv.cntk:ConfigDir=C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10 +configparameters: 02_BatchNormConv.cntk:currentDirectory=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData +configparameters: 02_BatchNormConv.cntk:DataDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData configparameters: 02_BatchNormConv.cntk:deviceId=0 configparameters: 02_BatchNormConv.cntk:imageLayout=cudnn configparameters: 02_BatchNormConv.cntk:initOnCPUOnly=true -configparameters: 02_BatchNormConv.cntk:ModelDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models -configparameters: 02_BatchNormConv.cntk:ndlMacros=C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10/Macros.ndl +configparameters: 02_BatchNormConv.cntk:ModelDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models +configparameters: 02_BatchNormConv.cntk:ndlMacros=C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10/Macros.ndl configparameters: 02_BatchNormConv.cntk:numMBsToShowResult=500 
-configparameters: 02_BatchNormConv.cntk:OutputDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu +configparameters: 02_BatchNormConv.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu configparameters: 02_BatchNormConv.cntk:precision=float -configparameters: 02_BatchNormConv.cntk:prefetch=true configparameters: 02_BatchNormConv.cntk:RootDir=. -configparameters: 02_BatchNormConv.cntk:RunDir=C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu +configparameters: 02_BatchNormConv.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu configparameters: 02_BatchNormConv.cntk:stderr=- configparameters: 02_BatchNormConv.cntk:Test=[ action = "test" - modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv" minibatchSize = 16 reader = [ readerType = "CNTKTextFormatReader" - file = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData/Test_cntk_text.txt" + file = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData/Test_cntk_text.txt" input = [ features = [ dim = 3072 @@ -230,16 +239,16 @@ configparameters: 02_BatchNormConv.cntk:Test=[ format = "dense" ] ] - ] + ] ] configparameters: 02_BatchNormConv.cntk:timestamping=true configparameters: 02_BatchNormConv.cntk:traceLevel=1 configparameters: 02_BatchNormConv.cntk:Train=[ action = "train" - modelPath = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv" + modelPath = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv" NDLNetworkBuilder = [ - networkDescription = "C:\jenkins\workspace\CNTK-Test-Windows-W1\Examples\Image\Miscellaneous\CIFAR-10/02_BatchNormConv.ndl" + networkDescription = "C:\Users\mahilleb\Repos\CNTK\Examples\Image\Miscellaneous\CIFAR-10/02_BatchNormConv.ndl" ] SGD = [ epochSize = 49984 @@ -252,7 +261,7 @@ configparameters: 02_BatchNormConv.cntk:Train=[ ] reader = [ readerType = "CNTKTextFormatReader" - file = "C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData/Train_cntk_text.txt" + file = "C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu\TestData/Train_cntk_text.txt" input = [ features = [ dim = 3072 @@ -263,27 +272,75 @@ configparameters: 02_BatchNormConv.cntk:Train=[ format = "dense" ] ] - ] + ] ] [SGD=[maxEpochs=5]] [SGD=[epochSize=100]] -05/13/2016 08:18:23: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< -05/13/2016 08:18:23: Commands: Train Test -05/13/2016 08:18:23: 
Precision = "float" -05/13/2016 08:18:23: CNTKModelPath: C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv -05/13/2016 08:18:23: CNTKCommandTrainInfo: Train : 5 -05/13/2016 08:18:23: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 5 +08/22/2016 16:48:06: <<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<< +08/22/2016 16:48:06: Commands: Train Test +08/22/2016 16:48:06: Precision = "float" +08/22/2016 16:48:06: CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv +08/22/2016 16:48:06: CNTKCommandTrainInfo: Train : 5 +08/22/2016 16:48:06: CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 5 -05/13/2016 08:18:23: ############################################################################## -05/13/2016 08:18:23: # # -05/13/2016 08:18:23: # Action "train" # -05/13/2016 08:18:23: # # -05/13/2016 08:18:23: ############################################################################## +08/22/2016 16:48:06: ############################################################################## +08/22/2016 16:48:06: # # +08/22/2016 16:48:06: # Action "train" # +08/22/2016 16:48:06: # # +08/22/2016 16:48:06: ############################################################################## -05/13/2016 08:18:23: CNTKCommandTrainBegin: Train +08/22/2016 16:48:06: CNTKCommandTrainBegin: Train NDLBuilder Using GPU 0 -05/13/2016 08:18:24: Creating virgin network. +08/22/2016 16:48:06: Creating virgin network. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 0.000000. +Node 'conv1.c.W' (LearnableParameter operation): Initializing Parameter[32 x 75] <- 0.000000. +Node 'conv1.c.c.b' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv1.c.c.sc' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv1.c.c.m' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv1.c.c.var' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv2.c.W' (LearnableParameter operation): Initializing Parameter[32 x 800] <- 0.000000. +Node 'conv2.c.c.b' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv2.c.c.sc' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv2.c.c.m' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv2.c.c.var' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv3.c.W' (LearnableParameter operation): Initializing Parameter[64 x 800] <- 0.000000. +Node 'conv3.c.c.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'conv3.c.c.sc' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'conv3.c.c.m' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'conv3.c.c.var' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 3 x 3 x 64] <- 0.000000. +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'h1.sc' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. 
+Node 'h1.m' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'h1.var' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- 0.000000. +Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'featOffs' (LearnableParameter operation): Initializing Parameter[1 x 1] <- 128.000000. +Node 'conv1.c.W' (LearnableParameter operation): Initializing Parameter[32 x 75] <- gaussian(seed=1, range=0.023094*0.004300, onCPU=false). Microsoft::MSR::CNTK::GPUMatrix::SetGaussianRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4 +Node 'conv1.c.c.b' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv1.c.c.sc' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 1.000000. +Node 'conv1.c.c.m' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv1.c.c.var' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv2.c.W' (LearnableParameter operation): Initializing Parameter[32 x 800] <- gaussian(seed=2, range=0.007071*1.414000, onCPU=false). +Node 'conv2.c.c.b' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv2.c.c.sc' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 1.000000. +Node 'conv2.c.c.m' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv2.c.c.var' (LearnableParameter operation): Initializing Parameter[32 x 1] <- 0.000000. +Node 'conv3.c.W' (LearnableParameter operation): Initializing Parameter[64 x 800] <- gaussian(seed=3, range=0.007071*1.414000, onCPU=false). +Node 'conv3.c.c.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'conv3.c.c.sc' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 1.000000. +Node 'conv3.c.c.m' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'conv3.c.c.var' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'h1.W' (LearnableParameter operation): Initializing Parameter[64 x 3 x 3 x 64] <- gaussian(seed=4, range=0.008333*12.000000, onCPU=false). +Node 'h1.b' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'h1.sc' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 1.000000. +Node 'h1.m' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'h1.var' (LearnableParameter operation): Initializing Parameter[64 x 1] <- 0.000000. +Node 'OutputNodes.W' (LearnableParameter operation): Initializing Parameter[10 x 64] <- gaussian(seed=5, range=0.025000*1.500000, onCPU=false). +Node 'OutputNodes.b' (LearnableParameter operation): Initializing Parameter[10] <- 0.000000. Post-processing network... 
@@ -307,32 +364,32 @@ Validating --> conv1.c.c.c = Convolution (conv1.c.W, featScaled) : [32 x 75], [3 Validating --> conv1.c.c.sc = LearnableParameter() : -> [32 x 1] Validating --> conv1.c.c.b = LearnableParameter() : -> [32 x 1] Validating --> conv1.c.c.m = LearnableParameter() : -> [32 x 1] -Validating --> conv1.c.c.isd = LearnableParameter() : -> [32 x 1] -Validating --> conv1.c.c.y = BatchNormalization (conv1.c.c.c, conv1.c.c.sc, conv1.c.c.b, conv1.c.c.m, conv1.c.c.isd) : [32 x 32 x 32 x *], [32 x 1], [32 x 1], [32 x 1], [32 x 1] -> [32 x 32 x 32 x *] +Validating --> conv1.c.c.var = LearnableParameter() : -> [32 x 1] +Validating --> conv1.c.c.y = BatchNormalization (conv1.c.c.c, conv1.c.c.sc, conv1.c.c.b, conv1.c.c.m, conv1.c.c.var) : [32 x 32 x 32 x *], [32 x 1], [32 x 1], [32 x 1], [32 x 1] -> [32 x 32 x 32 x *] Validating --> conv1.y = RectifiedLinear (conv1.c.c.y) : [32 x 32 x 32 x *] -> [32 x 32 x 32 x *] Validating --> pool1 = MaxPooling (conv1.y) : [32 x 32 x 32 x *] -> [15 x 15 x 32 x *] Validating --> conv2.c.c.c = Convolution (conv2.c.W, pool1) : [32 x 800], [15 x 15 x 32 x *] -> [15 x 15 x 32 x *] Validating --> conv2.c.c.sc = LearnableParameter() : -> [32 x 1] Validating --> conv2.c.c.b = LearnableParameter() : -> [32 x 1] Validating --> conv2.c.c.m = LearnableParameter() : -> [32 x 1] -Validating --> conv2.c.c.isd = LearnableParameter() : -> [32 x 1] -Validating --> conv2.c.c.y = BatchNormalization (conv2.c.c.c, conv2.c.c.sc, conv2.c.c.b, conv2.c.c.m, conv2.c.c.isd) : [15 x 15 x 32 x *], [32 x 1], [32 x 1], [32 x 1], [32 x 1] -> [15 x 15 x 32 x *] +Validating --> conv2.c.c.var = LearnableParameter() : -> [32 x 1] +Validating --> conv2.c.c.y = BatchNormalization (conv2.c.c.c, conv2.c.c.sc, conv2.c.c.b, conv2.c.c.m, conv2.c.c.var) : [15 x 15 x 32 x *], [32 x 1], [32 x 1], [32 x 1], [32 x 1] -> [15 x 15 x 32 x *] Validating --> conv2.y = RectifiedLinear (conv2.c.c.y) : [15 x 15 x 32 x *] -> [15 x 15 x 32 x *] Validating --> pool2 = MaxPooling (conv2.y) : [15 x 15 x 32 x *] -> [7 x 7 x 32 x *] Validating --> conv3.c.c.c = Convolution (conv3.c.W, pool2) : [64 x 800], [7 x 7 x 32 x *] -> [7 x 7 x 64 x *] Validating --> conv3.c.c.sc = LearnableParameter() : -> [64 x 1] Validating --> conv3.c.c.b = LearnableParameter() : -> [64 x 1] Validating --> conv3.c.c.m = LearnableParameter() : -> [64 x 1] -Validating --> conv3.c.c.isd = LearnableParameter() : -> [64 x 1] -Validating --> conv3.c.c.y = BatchNormalization (conv3.c.c.c, conv3.c.c.sc, conv3.c.c.b, conv3.c.c.m, conv3.c.c.isd) : [7 x 7 x 64 x *], [64 x 1], [64 x 1], [64 x 1], [64 x 1] -> [7 x 7 x 64 x *] +Validating --> conv3.c.c.var = LearnableParameter() : -> [64 x 1] +Validating --> conv3.c.c.y = BatchNormalization (conv3.c.c.c, conv3.c.c.sc, conv3.c.c.b, conv3.c.c.m, conv3.c.c.var) : [7 x 7 x 64 x *], [64 x 1], [64 x 1], [64 x 1], [64 x 1] -> [7 x 7 x 64 x *] Validating --> conv3.y = RectifiedLinear (conv3.c.c.y) : [7 x 7 x 64 x *] -> [7 x 7 x 64 x *] Validating --> pool3 = MaxPooling (conv3.y) : [7 x 7 x 64 x *] -> [3 x 3 x 64 x *] Validating --> h1.t = Times (h1.W, pool3) : [64 x 3 x 3 x 64], [3 x 3 x 64 x *] -> [64 x *] Validating --> h1.sc = LearnableParameter() : -> [64 x 1] Validating --> h1.b = LearnableParameter() : -> [64 x 1] Validating --> h1.m = LearnableParameter() : -> [64 x 1] -Validating --> h1.isd = LearnableParameter() : -> [64 x 1] -Validating --> h1.bn = BatchNormalization (h1.t, h1.sc, h1.b, h1.m, h1.isd) : [64 x *], [64 x 1], [64 x 1], [64 x 1], [64 x 1] -> [64 x *] +Validating --> h1.var = 
LearnableParameter() : -> [64 x 1] +Validating --> h1.bn = BatchNormalization (h1.t, h1.sc, h1.b, h1.m, h1.var) : [64 x *], [64 x 1], [64 x 1], [64 x 1], [64 x 1] -> [64 x *] Validating --> h1.y = RectifiedLinear (h1.bn) : [64 x *] -> [64 x *] Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *] -> [10 x *] Validating --> OutputNodes.b = LearnableParameter() : -> [10] @@ -346,23 +403,23 @@ Validating network. 20 nodes to process in pass 2. Validating network, final pass. -Using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. +conv1.c.c.c: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. Using CNTK batch normalization engine. -Using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. +pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. -Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. +conv2.c.c.c: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. Using CNTK batch normalization engine. -Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. +pool2: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. -Using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. +conv3.c.c.c: using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. Using CNTK batch normalization engine. -Using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. +pool3: using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. Using CNTK batch normalization engine. @@ -371,119 +428,127 @@ Using CNTK batch normalization engine. Post-processing network complete. -05/13/2016 08:18:26: Created model with 45 nodes on GPU 0. +08/22/2016 16:48:07: Created model with 45 nodes on GPU 0. 
-05/13/2016 08:18:26: Training criterion node(s): -05/13/2016 08:18:26: CE = CrossEntropyWithSoftmax +08/22/2016 16:48:07: Training criterion node(s): +08/22/2016 16:48:07: CE = CrossEntropyWithSoftmax -05/13/2016 08:18:26: Evaluation criterion node(s): - -05/13/2016 08:18:26: Err = ErrorPrediction +08/22/2016 16:48:07: Evaluation criterion node(s): +08/22/2016 16:48:07: Err = ErrorPrediction Allocating matrices for forward and/or backward propagation. -Memory Sharing Structure: - -0000000000000000: {[Err Gradient[1]] [conv1.c.c.isd Gradient[32 x 1]] [conv1.c.c.m Gradient[32 x 1]] [conv2.c.c.isd Gradient[32 x 1]] [conv2.c.c.m Gradient[32 x 1]] [conv3.c.c.isd Gradient[64 x 1]] [conv3.c.c.m Gradient[64 x 1]] [featOffs Gradient[1 x 1]] [featScaled Gradient[32 x 32 x 3 x *]] [features Gradient[32 x 32 x 3 x *]] [h1.isd Gradient[64 x 1]] [h1.m Gradient[64 x 1]] [labels Gradient[10 x *]] } -000000E89AC81140: {[conv3.c.c.sc Value[64 x 1]] } -000000E89AC813C0: {[conv2.c.c.sc Value[32 x 1]] } -000000E89AC815A0: {[conv2.c.c.b Value[32 x 1]] } -000000E89AC81820: {[h1.isd Value[64 x 1]] } -000000E89AC81A00: {[OutputNodes.W Value[10 x 64]] } -000000E89AC81BE0: {[OutputNodes.b Value[10]] } -000000E89AC81F00: {[h1.W Value[64 x 3 x 3 x 64]] } -000000E89AC81FA0: {[h1.m Value[64 x 1]] } -000000E89AC82180: {[conv2.c.c.m Value[32 x 1]] } -000000E89AC822C0: {[conv3.c.c.isd Value[64 x 1]] } -000000E89AC82540: {[h1.sc Value[64 x 1]] } -000000E89AC825E0: {[conv2.c.c.isd Value[32 x 1]] } -000000E89AC82680: {[conv3.c.c.m Value[64 x 1]] } -000000E89AC82720: {[h1.b Value[64 x 1]] } -000000E89AC82860: {[conv3.c.W Value[64 x 800]] } -000000E89AC82AE0: {[conv2.c.W Value[32 x 800]] } -000000E89AC82B80: {[conv3.c.c.b Value[64 x 1]] } -000000E8A0568140: {[featScaled Value[32 x 32 x 3 x *]] } -000000E8A05681E0: {[conv2.c.c.y Gradient[15 x 15 x 32 x *]] [pool2 Value[7 x 7 x 32 x *]] } -000000E8A0568280: {[conv2.c.c.sc Gradient[32 x 1]] [conv2.y Gradient[15 x 15 x 32 x *]] } -000000E8A0568320: {[conv3.c.c.y Value[7 x 7 x 64 x *]] } -000000E8A0568460: {[conv2.c.c.b Gradient[32 x 1]] [conv3.c.c.c Gradient[7 x 7 x 64 x *]] [conv3.y Value[7 x 7 x 64 x *]] } -000000E8A05685A0: {[OutputNodes.t Value[10 x *]] [h1.bn Gradient[64 x *]] } -000000E8A0568A00: {[Err Value[1]] } -000000E8A0568AA0: {[conv2.c.c.y Value[15 x 15 x 32 x *]] } -000000E8A0568BE0: {[conv1.c.c.b Gradient[32 x 1]] [conv2.c.c.c Gradient[15 x 15 x 32 x *]] [conv2.y Value[15 x 15 x 32 x *]] } -000000E8A0568D20: {[conv3.c.c.b Gradient[64 x 1]] } -000000E8A0568DC0: {[conv3.c.c.sc Gradient[64 x 1]] [conv3.y Gradient[7 x 7 x 64 x *]] [h1.t Value[64 x *]] } -000000E8A0568E60: {[conv3.c.W Gradient[64 x 800]] [h1.t Gradient[64 x *]] [h1.y Value[64 x *]] } -000000E8A0569040: {[conv1.c.c.y Gradient[32 x 32 x 32 x *]] [pool1 Value[15 x 15 x 32 x *]] } -000000E8A0569400: {[conv1.c.c.y Value[32 x 32 x 32 x *]] } -000000E8A05694A0: {[conv2.c.W Gradient[32 x 800]] [conv3.c.c.c Value[7 x 7 x 64 x *]] } -000000E8A0569540: {[OutputNodes.W Gradient[10 x 64]] [OutputNodes.z Gradient[10 x *]] } -000000E8A0569680: {[OutputNodes.t Gradient[10 x *]] [pool1 Gradient[15 x 15 x 32 x *]] [pool2 Gradient[7 x 7 x 32 x *]] [pool3 Gradient[3 x 3 x 64 x *]] } -000000E8A0569720: {[OutputNodes.b Gradient[10]] } -000000E8A05697C0: {[h1.sc Gradient[64 x 1]] [h1.y Gradient[64 x *]] } -000000E8A0569860: {[conv1.c.W Gradient[32 x 75]] [conv2.c.c.c Value[15 x 15 x 32 x *]] } -000000E8A0569900: {[conv1.c.c.c Gradient[32 x 32 x 32 x *]] [conv1.y Value[32 x 32 x 32 x *]] } -000000E8A05699A0: {[CE Gradient[1]] } 
-000000E8A0569A40: {[h1.W Gradient[64 x 3 x 3 x 64]] } -000000E8A0569B80: {[conv3.c.c.y Gradient[7 x 7 x 64 x *]] [pool3 Value[3 x 3 x 64 x *]] } -000000E8A0569E00: {[h1.bn Value[64 x *]] } -000000E8A0569FE0: {[h1.b Gradient[64 x 1]] } -000000E8A056A120: {[conv1.c.c.sc Gradient[32 x 1]] [conv1.y Gradient[32 x 32 x 32 x *]] } -000000E8A056A3A0: {[CE Value[1]] } -000000E8A056A620: {[OutputNodes.z Value[10 x *]] } -000000E8A056A760: {[conv1.c.c.c Value[32 x 32 x 32 x *]] } -000000E8FC080980: {[featOffs Value[1 x 1]] } -000000E8FC0811A0: {[conv1.c.W Value[32 x 75]] } -000000E8FC081240: {[conv1.c.c.b Value[32 x 1]] } -000000E8FC081740: {[conv1.c.c.sc Value[32 x 1]] } -000000E8FC081920: {[labels Value[10 x *]] } -000000E8FC081D80: {[features Value[32 x 32 x 3 x *]] } -000000E8FC081EC0: {[conv1.c.c.m Value[32 x 1]] } -000000E8FC081F60: {[conv1.c.c.isd Value[32 x 1]] } - -05/13/2016 08:18:26: No PreCompute nodes found, skipping PreCompute step. - -05/13/2016 08:18:26: Starting Epoch 1: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples - -05/13/2016 08:18:26: Starting minibatch loop. -05/13/2016 08:18:35: Finished Epoch[ 1 of 5]: [Training] CE = 2.31451355 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 100; learningRatePerSample = 0.00046874999; epochTime=9.33323s -05/13/2016 08:18:36: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv.1' - -05/13/2016 08:18:37: Starting Epoch 2: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples - -05/13/2016 08:18:37: Starting minibatch loop. -05/13/2016 08:18:37: Finished Epoch[ 2 of 5]: [Training] CE = 2.27380722 * 100; Err = 0.82000000 * 100; totalSamplesSeen = 200; learningRatePerSample = 0.00046874999; epochTime=0.020597s -05/13/2016 08:18:37: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv.2' - -05/13/2016 08:18:37: Starting Epoch 3: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples - -05/13/2016 08:18:37: Starting minibatch loop. -05/13/2016 08:18:37: Finished Epoch[ 3 of 5]: [Training] CE = 2.25248398 * 100; Err = 0.83000000 * 100; totalSamplesSeen = 300; learningRatePerSample = 0.00046874999; epochTime=0.020236s -05/13/2016 08:18:37: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv.3' - -05/13/2016 08:18:37: Starting Epoch 4: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples - -05/13/2016 08:18:37: Starting minibatch loop. 
-05/13/2016 08:18:37: Finished Epoch[ 4 of 5]: [Training] CE = 2.15781601 * 100; Err = 0.77000000 * 100; totalSamplesSeen = 400; learningRatePerSample = 0.00046874999; epochTime=0.020351s -05/13/2016 08:18:37: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv.4' - -05/13/2016 08:18:37: Starting Epoch 5: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples - -05/13/2016 08:18:37: Starting minibatch loop. -05/13/2016 08:18:37: Finished Epoch[ 5 of 5]: [Training] CE = 2.12939789 * 100; Err = 0.71000000 * 100; totalSamplesSeen = 500; learningRatePerSample = 0.00046874999; epochTime=0.02018s -05/13/2016 08:18:37: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20160513081543.861015\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv' -05/13/2016 08:18:37: CNTKCommandTrainEnd: Train - -05/13/2016 08:18:37: Action "train" complete. - - -05/13/2016 08:18:37: ############################################################################## -05/13/2016 08:18:37: # # -05/13/2016 08:18:37: # Action "test" # -05/13/2016 08:18:37: # # -05/13/2016 08:18:37: ############################################################################## - +Memory Sharing: Out of 77 matrices, 38 are shared as 16, and 39 are not shared. + + { conv1.c.c.y : [32 x 32 x 32 x *] (gradient) + pool1 : [15 x 15 x 32 x *] } + { conv2.c.c.sc : [32 x 1] (gradient) + conv2.y : [15 x 15 x 32 x *] (gradient) } + { conv1.c.W : [32 x 75] (gradient) + conv2.c.c.c : [15 x 15 x 32 x *] } + { conv2.c.c.b : [32 x 1] (gradient) + conv3.c.c.c : [7 x 7 x 64 x *] (gradient) + conv3.y : [7 x 7 x 64 x *] } + { conv3.c.c.y : [7 x 7 x 64 x *] (gradient) + pool3 : [3 x 3 x 64 x *] } + { conv2.c.c.y : [15 x 15 x 32 x *] (gradient) + pool2 : [7 x 7 x 32 x *] } + { conv1.c.c.b : [32 x 1] (gradient) + conv2.c.c.c : [15 x 15 x 32 x *] (gradient) + conv2.y : [15 x 15 x 32 x *] } + { conv1.c.c.c : [32 x 32 x 32 x *] (gradient) + conv1.y : [32 x 32 x 32 x *] } + { conv2.c.W : [32 x 800] (gradient) + conv3.c.c.c : [7 x 7 x 64 x *] } + { conv3.c.c.sc : [64 x 1] (gradient) + conv3.y : [7 x 7 x 64 x *] (gradient) + h1.t : [64 x *] } + { conv1.c.c.sc : [32 x 1] (gradient) + conv1.y : [32 x 32 x 32 x *] (gradient) } + { OutputNodes.t : [10 x *] + h1.bn : [64 x *] (gradient) } + { conv3.c.W : [64 x 800] (gradient) + h1.t : [64 x *] (gradient) + h1.y : [64 x *] } + { OutputNodes.W : [10 x 64] (gradient) + OutputNodes.z : [10 x *] (gradient) } + { OutputNodes.t : [10 x *] (gradient) + pool1 : [15 x 15 x 32 x *] (gradient) + pool2 : [7 x 7 x 32 x *] (gradient) + pool3 : [3 x 3 x 64 x *] (gradient) } + { h1.sc : [64 x 1] (gradient) + h1.y : [64 x *] (gradient) } + + +08/22/2016 16:48:07: Training 117098 parameters in 14 out of 14 parameter tensors and 32 nodes with gradient: + +08/22/2016 16:48:07: Node 'OutputNodes.W' (LearnableParameter operation) : [10 x 64] +08/22/2016 16:48:07: Node 'OutputNodes.b' (LearnableParameter operation) : [10] +08/22/2016 16:48:07: Node 'conv1.c.W' (LearnableParameter operation) : [32 x 75] +08/22/2016 16:48:07: Node 'conv1.c.c.b' (LearnableParameter operation) : [32 x 1] +08/22/2016 16:48:07: Node 'conv1.c.c.sc' (LearnableParameter operation) : [32 x 1] +08/22/2016 16:48:07: Node 'conv2.c.W' (LearnableParameter operation) : [32 x 800] +08/22/2016 
16:48:07: Node 'conv2.c.c.b' (LearnableParameter operation) : [32 x 1] +08/22/2016 16:48:07: Node 'conv2.c.c.sc' (LearnableParameter operation) : [32 x 1] +08/22/2016 16:48:07: Node 'conv3.c.W' (LearnableParameter operation) : [64 x 800] +08/22/2016 16:48:07: Node 'conv3.c.c.b' (LearnableParameter operation) : [64 x 1] +08/22/2016 16:48:07: Node 'conv3.c.c.sc' (LearnableParameter operation) : [64 x 1] +08/22/2016 16:48:07: Node 'h1.W' (LearnableParameter operation) : [64 x 3 x 3 x 64] +08/22/2016 16:48:07: Node 'h1.b' (LearnableParameter operation) : [64 x 1] +08/22/2016 16:48:07: Node 'h1.sc' (LearnableParameter operation) : [64 x 1] + +08/22/2016 16:48:07: No PreCompute nodes found, or all already computed. Skipping pre-computation step. + +08/22/2016 16:48:07: Starting Epoch 1: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 0: frames [0..100] (first sequence at sample 0), data subset 0 of 1 + +08/22/2016 16:48:07: Starting minibatch loop. +08/22/2016 16:48:11: Finished Epoch[ 1 of 5]: [Training] CE = 2.31451340 * 100; Err = 0.87000000 * 100; totalSamplesSeen = 100; learningRatePerSample = 0.00046874999; epochTime=4.17383s +08/22/2016 16:48:11: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv.1' + +08/22/2016 16:48:11: Starting Epoch 2: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 1: frames [100..200] (first sequence at sample 100), data subset 0 of 1 + +08/22/2016 16:48:11: Starting minibatch loop. +08/22/2016 16:48:11: Finished Epoch[ 2 of 5]: [Training] CE = 2.27382584 * 100; Err = 0.82000000 * 100; totalSamplesSeen = 200; learningRatePerSample = 0.00046874999; epochTime=0.06179s +08/22/2016 16:48:11: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv.2' + +08/22/2016 16:48:11: Starting Epoch 3: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 2: frames [200..300] (first sequence at sample 200), data subset 0 of 1 + +08/22/2016 16:48:11: Starting minibatch loop. +08/22/2016 16:48:11: Finished Epoch[ 3 of 5]: [Training] CE = 2.25143677 * 100; Err = 0.84000000 * 100; totalSamplesSeen = 300; learningRatePerSample = 0.00046874999; epochTime=0.059625s +08/22/2016 16:48:11: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv.3' + +08/22/2016 16:48:11: Starting Epoch 4: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 3: frames [300..400] (first sequence at sample 300), data subset 0 of 1 + +08/22/2016 16:48:11: Starting minibatch loop. 
+08/22/2016 16:48:11: Finished Epoch[ 4 of 5]: [Training] CE = 2.15970703 * 100; Err = 0.77000000 * 100; totalSamplesSeen = 400; learningRatePerSample = 0.00046874999; epochTime=0.06332s +08/22/2016 16:48:11: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv.4' + +08/22/2016 16:48:11: Starting Epoch 5: learning rate per sample = 0.000469 effective momentum = 0.000000 momentum as time constant = 0.0 samples +BlockRandomizer::StartEpoch: epoch 4: frames [400..500] (first sequence at sample 400), data subset 0 of 1 + +08/22/2016 16:48:11: Starting minibatch loop. +08/22/2016 16:48:11: Finished Epoch[ 5 of 5]: [Training] CE = 2.13000214 * 100; Err = 0.69000000 * 100; totalSamplesSeen = 500; learningRatePerSample = 0.00046874999; epochTime=0.062707s +08/22/2016 16:48:11: SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160822174803.255981\Examples\Image\Miscellaneous\CIFAR-10_02_BatchNormConv@release_gpu/Models/02_BatchNormConv' +08/22/2016 16:48:11: CNTKCommandTrainEnd: Train + +08/22/2016 16:48:11: Action "train" complete. + + +08/22/2016 16:48:11: ############################################################################## +08/22/2016 16:48:11: # # +08/22/2016 16:48:11: # Action "test" # +08/22/2016 16:48:11: # # +08/22/2016 16:48:11: ############################################################################## + +INFO: conv1.c.c.y: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model +INFO: conv2.c.c.y: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model +INFO: conv3.c.c.y: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model +INFO: h1.bn: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model Post-processing network... 
@@ -507,32 +572,32 @@ Validating --> conv1.c.c.c = Convolution (conv1.c.W, featScaled) : [32 x 75], [3 Validating --> conv1.c.c.sc = LearnableParameter() : -> [32 x 1] Validating --> conv1.c.c.b = LearnableParameter() : -> [32 x 1] Validating --> conv1.c.c.m = LearnableParameter() : -> [32 x 1] -Validating --> conv1.c.c.isd = LearnableParameter() : -> [32 x 1] -Validating --> conv1.c.c.y = BatchNormalization (conv1.c.c.c, conv1.c.c.sc, conv1.c.c.b, conv1.c.c.m, conv1.c.c.isd) : [32 x 32 x 32 x *1], [32 x 1], [32 x 1], [32 x 1], [32 x 1] -> [32 x 32 x 32 x *1] +Validating --> conv1.c.c.var = LearnableParameter() : -> [32 x 1] +Validating --> conv1.c.c.y = BatchNormalization (conv1.c.c.c, conv1.c.c.sc, conv1.c.c.b, conv1.c.c.m, conv1.c.c.var) : [32 x 32 x 32 x *1], [32 x 1], [32 x 1], [32 x 1], [32 x 1] -> [32 x 32 x 32 x *1] Validating --> conv1.y = RectifiedLinear (conv1.c.c.y) : [32 x 32 x 32 x *1] -> [32 x 32 x 32 x *1] Validating --> pool1 = MaxPooling (conv1.y) : [32 x 32 x 32 x *1] -> [15 x 15 x 32 x *1] Validating --> conv2.c.c.c = Convolution (conv2.c.W, pool1) : [32 x 800], [15 x 15 x 32 x *1] -> [15 x 15 x 32 x *1] Validating --> conv2.c.c.sc = LearnableParameter() : -> [32 x 1] Validating --> conv2.c.c.b = LearnableParameter() : -> [32 x 1] Validating --> conv2.c.c.m = LearnableParameter() : -> [32 x 1] -Validating --> conv2.c.c.isd = LearnableParameter() : -> [32 x 1] -Validating --> conv2.c.c.y = BatchNormalization (conv2.c.c.c, conv2.c.c.sc, conv2.c.c.b, conv2.c.c.m, conv2.c.c.isd) : [15 x 15 x 32 x *1], [32 x 1], [32 x 1], [32 x 1], [32 x 1] -> [15 x 15 x 32 x *1] +Validating --> conv2.c.c.var = LearnableParameter() : -> [32 x 1] +Validating --> conv2.c.c.y = BatchNormalization (conv2.c.c.c, conv2.c.c.sc, conv2.c.c.b, conv2.c.c.m, conv2.c.c.var) : [15 x 15 x 32 x *1], [32 x 1], [32 x 1], [32 x 1], [32 x 1] -> [15 x 15 x 32 x *1] Validating --> conv2.y = RectifiedLinear (conv2.c.c.y) : [15 x 15 x 32 x *1] -> [15 x 15 x 32 x *1] Validating --> pool2 = MaxPooling (conv2.y) : [15 x 15 x 32 x *1] -> [7 x 7 x 32 x *1] Validating --> conv3.c.c.c = Convolution (conv3.c.W, pool2) : [64 x 800], [7 x 7 x 32 x *1] -> [7 x 7 x 64 x *1] Validating --> conv3.c.c.sc = LearnableParameter() : -> [64 x 1] Validating --> conv3.c.c.b = LearnableParameter() : -> [64 x 1] Validating --> conv3.c.c.m = LearnableParameter() : -> [64 x 1] -Validating --> conv3.c.c.isd = LearnableParameter() : -> [64 x 1] -Validating --> conv3.c.c.y = BatchNormalization (conv3.c.c.c, conv3.c.c.sc, conv3.c.c.b, conv3.c.c.m, conv3.c.c.isd) : [7 x 7 x 64 x *1], [64 x 1], [64 x 1], [64 x 1], [64 x 1] -> [7 x 7 x 64 x *1] +Validating --> conv3.c.c.var = LearnableParameter() : -> [64 x 1] +Validating --> conv3.c.c.y = BatchNormalization (conv3.c.c.c, conv3.c.c.sc, conv3.c.c.b, conv3.c.c.m, conv3.c.c.var) : [7 x 7 x 64 x *1], [64 x 1], [64 x 1], [64 x 1], [64 x 1] -> [7 x 7 x 64 x *1] Validating --> conv3.y = RectifiedLinear (conv3.c.c.y) : [7 x 7 x 64 x *1] -> [7 x 7 x 64 x *1] Validating --> pool3 = MaxPooling (conv3.y) : [7 x 7 x 64 x *1] -> [3 x 3 x 64 x *1] Validating --> h1.t = Times (h1.W, pool3) : [64 x 3 x 3 x 64], [3 x 3 x 64 x *1] -> [64 x *1] Validating --> h1.sc = LearnableParameter() : -> [64 x 1] Validating --> h1.b = LearnableParameter() : -> [64 x 1] Validating --> h1.m = LearnableParameter() : -> [64 x 1] -Validating --> h1.isd = LearnableParameter() : -> [64 x 1] -Validating --> h1.bn = BatchNormalization (h1.t, h1.sc, h1.b, h1.m, h1.isd) : [64 x *1], [64 x 1], [64 x 1], [64 x 1], [64 x 1] -> [64 x *1] 
+Validating --> h1.var = LearnableParameter() : -> [64 x 1] +Validating --> h1.bn = BatchNormalization (h1.t, h1.sc, h1.b, h1.m, h1.var) : [64 x *1], [64 x 1], [64 x 1], [64 x 1], [64 x 1] -> [64 x *1] Validating --> h1.y = RectifiedLinear (h1.bn) : [64 x *1] -> [64 x *1] Validating --> OutputNodes.t = Times (OutputNodes.W, h1.y) : [10 x 64], [64 x *1] -> [10 x *1] Validating --> OutputNodes.b = LearnableParameter() : -> [10] @@ -546,23 +611,23 @@ Validating network. 20 nodes to process in pass 2. Validating network, final pass. -Using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. +conv1.c.c.c: using cuDNN convolution engine for geometry: Input: 32 x 32 x 3, Output: 32 x 32 x 32, Kernel: 5 x 5 x 3, Map: 1 x 1 x 32, Stride: 1 x 1 x 3, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. Using CNTK batch normalization engine. -Using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. +pool1: using cuDNN convolution engine for geometry: Input: 32 x 32 x 32, Output: 15 x 15 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. -Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. +conv2.c.c.c: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 15 x 15 x 32, Kernel: 5 x 5 x 32, Map: 1 x 1 x 32, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. Using CNTK batch normalization engine. -Using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. +pool2: using cuDNN convolution engine for geometry: Input: 15 x 15 x 32, Output: 7 x 7 x 32, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. -Using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. +conv3.c.c.c: using cuDNN convolution engine for geometry: Input: 7 x 7 x 32, Output: 7 x 7 x 64, Kernel: 5 x 5 x 32, Map: 1 x 1 x 64, Stride: 1 x 1 x 32, Sharing: (1), AutoPad: (1), LowerPad: 0, UpperPad: 0. Using CNTK batch normalization engine. -Using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. +pool3: using cuDNN convolution engine for geometry: Input: 7 x 7 x 64, Output: 3 x 3 x 64, Kernel: 3 x 3 x 1, Map: 1, Stride: 2 x 2 x 1, Sharing: (1), AutoPad: (0), LowerPad: 0, UpperPad: 0. Using CNTK batch normalization engine. @@ -576,57 +641,14 @@ evalNodeNames are not specified, using all the default evalnodes and training cr Allocating matrices for forward and/or backward propagation. 
-Memory Sharing Structure: - -0000000000000000: {[CE Gradient[1]] [Err Gradient[1]] [OutputNodes.W Gradient[10 x 64]] [OutputNodes.b Gradient[10]] [OutputNodes.t Gradient[10 x *1]] [OutputNodes.z Gradient[10 x *1]] [conv1.c.W Gradient[32 x 75]] [conv1.c.c.b Gradient[32 x 1]] [conv1.c.c.c Gradient[32 x 32 x 32 x *1]] [conv1.c.c.isd Gradient[32 x 1]] [conv1.c.c.m Gradient[32 x 1]] [conv1.c.c.sc Gradient[32 x 1]] [conv1.c.c.y Gradient[32 x 32 x 32 x *1]] [conv1.y Gradient[32 x 32 x 32 x *1]] [conv2.c.W Gradient[32 x 800]] [conv2.c.c.b Gradient[32 x 1]] [conv2.c.c.c Gradient[15 x 15 x 32 x *1]] [conv2.c.c.isd Gradient[32 x 1]] [conv2.c.c.m Gradient[32 x 1]] [conv2.c.c.sc Gradient[32 x 1]] [conv2.c.c.y Gradient[15 x 15 x 32 x *1]] [conv2.y Gradient[15 x 15 x 32 x *1]] [conv3.c.W Gradient[64 x 800]] [conv3.c.c.b Gradient[64 x 1]] [conv3.c.c.c Gradient[7 x 7 x 64 x *1]] [conv3.c.c.isd Gradient[64 x 1]] [conv3.c.c.m Gradient[64 x 1]] [conv3.c.c.sc Gradient[64 x 1]] [conv3.c.c.y Gradient[7 x 7 x 64 x *1]] [conv3.y Gradient[7 x 7 x 64 x *1]] [featOffs Gradient[1 x 1]] [featScaled Gradient[32 x 32 x 3 x *1]] [features Gradient[32 x 32 x 3 x *1]] [h1.W Gradient[64 x 3 x 3 x 64]] [h1.b Gradient[64 x 1]] [h1.bn Gradient[64 x *1]] [h1.isd Gradient[64 x 1]] [h1.m Gradient[64 x 1]] [h1.sc Gradient[64 x 1]] [h1.t Gradient[64 x *1]] [h1.y Gradient[64 x *1]] [labels Gradient[10 x *1]] [pool1 Gradient[15 x 15 x 32 x *1]] [pool2 Gradient[7 x 7 x 32 x *1]] [pool3 Gradient[3 x 3 x 64 x *1]] } -000000E8A05681E0: {[conv2.c.c.c Value[15 x 15 x 32 x *1]] } -000000E8A0568460: {[conv1.c.c.c Value[32 x 32 x 32 x *1]] } -000000E8A05685A0: {[conv1.c.c.y Value[32 x 32 x 32 x *1]] } -000000E8A0568A00: {[CE Value[1]] } -000000E8A0568AA0: {[conv2.y Value[15 x 15 x 32 x *1]] } -000000E8A0568B40: {[h1.y Value[64 x *1]] } -000000E8A0568D20: {[featScaled Value[32 x 32 x 3 x *1]] } -000000E8A0568DC0: {[pool1 Value[15 x 15 x 32 x *1]] } -000000E8A05694A0: {[conv1.y Value[32 x 32 x 32 x *1]] } -000000E8A0569540: {[pool3 Value[3 x 3 x 64 x *1]] } -000000E8A0569680: {[OutputNodes.t Value[10 x *1]] } -000000E8A0569720: {[OutputNodes.z Value[10 x *1]] } -000000E8A05697C0: {[conv3.y Value[7 x 7 x 64 x *1]] } -000000E8A05699A0: {[conv3.c.c.y Value[7 x 7 x 64 x *1]] } -000000E8A0569D60: {[conv2.c.c.y Value[15 x 15 x 32 x *1]] } -000000E8A0569E00: {[h1.t Value[64 x *1]] } -000000E8A0569F40: {[conv3.c.c.c Value[7 x 7 x 64 x *1]] } -000000E8A056A080: {[Err Value[1]] } -000000E8A056A3A0: {[pool2 Value[7 x 7 x 32 x *1]] } -000000E8A056A620: {[h1.bn Value[64 x *1]] } -000000E8A16A32D0: {[h1.sc Value[64 x 1]] } -000000E8A16A3870: {[conv2.c.c.b Value[32 x 1]] } -000000E8A16A3C30: {[conv1.c.c.isd Value[32 x 1]] } -000000E8A16A3CD0: {[conv2.c.c.sc Value[32 x 1]] } -000000E8A16A3E10: {[conv3.c.c.b Value[64 x 1]] } -000000E8A16A3F50: {[conv1.c.c.b Value[32 x 1]] } -000000E8A16A4090: {[conv2.c.c.isd Value[32 x 1]] } -000000E8A16A4310: {[conv3.c.c.sc Value[64 x 1]] } -000000E8A16A4630: {[conv1.c.c.sc Value[32 x 1]] } -000000E8A16A46D0: {[conv1.c.W Value[32 x 75]] } -000000E8A16A4A90: {[conv1.c.c.m Value[32 x 1]] } -000000E8A16A4B30: {[conv3.c.W Value[64 x 800]] } -000000E8A16A4EF0: {[conv2.c.W Value[32 x 800]] } -000000E8A16A4F90: {[conv3.c.c.m Value[64 x 1]] } -000000E8A16A5030: {[featOffs Value[1 x 1]] } -000000E8A16A50D0: {[conv3.c.c.isd Value[64 x 1]] } -000000E8A16A5350: {[conv2.c.c.m Value[32 x 1]] } -000000E8A16A53F0: {[features Value[32 x 32 x 3 x *1]] } -000000E8A16A5530: {[h1.b Value[64 x 1]] } -000000E8A16A57B0: {[h1.isd Value[64 x 1]] } 
-000000E8A16A58F0: {[h1.m Value[64 x 1]] } -000000E8A16A5CB0: {[labels Value[10 x *1]] } -000000E8A16A6110: {[OutputNodes.W Value[10 x 64]] } -000000E8A16A61B0: {[OutputNodes.b Value[10]] } -000000E8A16A6930: {[h1.W Value[64 x 3 x 3 x 64]] } - -05/13/2016 08:18:52: Final Results: Minibatch[1-625]: Err = 0.84580000 * 10000; CE = 2.27296712 * 10000; perplexity = 9.70816338 - -05/13/2016 08:18:52: Action "test" complete. - -05/13/2016 08:18:52: __COMPLETED__ \ No newline at end of file +Memory Sharing: Out of 45 matrices, 0 are shared as 0, and 45 are not shared. + + +BlockRandomizer::StartEpoch: epoch 0: frames [0..10000] (first sequence at sample 0), data subset 0 of 1 +08/22/2016 16:48:14: Minibatch[1-500]: Err = 0.82550000 * 8000; CE = 4.16613538 * 8000 +08/22/2016 16:48:15: Minibatch[501-625]: Err = 0.82050000 * 2000; CE = 4.21907035 * 2000 +08/22/2016 16:48:15: Final Results: Minibatch[1-625]: Err = 0.82450000 * 10000; CE = 4.17672238 * 10000; perplexity = 65.15195926 + +08/22/2016 16:48:15: Action "test" complete. + +08/22/2016 16:48:15: __COMPLETED__ \ No newline at end of file diff --git a/Tests/UnitTests/EvalTests/EvalTests.vcxproj b/Tests/UnitTests/EvalTests/EvalTests.vcxproj index f2c6b2286d54..270076518349 100644 --- a/Tests/UnitTests/EvalTests/EvalTests.vcxproj +++ b/Tests/UnitTests/EvalTests/EvalTests.vcxproj @@ -155,7 +155,7 @@ - $(OutDir)..\cudnn64_4.dll + $(OutDir)..\cudnn64_5.dll diff --git a/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp b/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp index 8b38a7ab8d06..ae5d2b9925d1 100644 --- a/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp +++ b/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp @@ -61,7 +61,7 @@ std::vector> GenerateBNTes res.push_back(std::make_tuple(TensorShape(2, 2, 2048), 64, true, expAvgFactor, blendFactor)); } - // Test running mean/isd. + // Test running mean/variance. expAvgFactor = 0.1; res.push_back(std::make_tuple(TensorShape(2, 2, 2), 8, false, expAvgFactor, 0)); res.push_back(std::make_tuple(TensorShape(2, 2, 2), 8, true, expAvgFactor, 0)); @@ -75,6 +75,7 @@ BOOST_AUTO_TEST_SUITE(BatchNormalizationSuite) BOOST_AUTO_TEST_CASE(BatchNormalizationForward) { + // TODO tests for expAvgFactor 0? std::mt19937 rng(0); std::normal_distribution nd; @@ -186,7 +187,7 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward) #ifndef _DEBUG float elapsedCntk = time1.Elapsed(); float elapsedCudnn = time2.Elapsed(); - // Check performance. Current version of cuDNN (v4 RC) is significanlty slower than CNTK implementation. + // Check performance. Current version of cuDNN (v4 RC) is significantly slower than CNTK implementation. // For optimal cases (vectorSize % 32 == 0 and batchSize % 32 == 0), CNTK implementation can be >5x faster than cuDNN. // Production version is about the same. if (crow >= 32 && ccol >= 32) diff --git a/Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp b/Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp index d3b6815145f9..2684e27833f0 100644 --- a/Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp +++ b/Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp @@ -224,7 +224,7 @@ BOOST_AUTO_TEST_CASE(ConvolutionForward) std::string emsg; BOOST_REQUIRE_MESSAGE(!out.HasNan("out"), "out" << msgNan); - BOOST_REQUIRE_MESSAGE(CheckEqual(out, outB, emsg, relErr * 4, absErr * 9), "out" << msg << ". " << emsg); + BOOST_REQUIRE_MESSAGE(CheckEqual(out, outB, emsg, relErr * 4, absErr * 14), "out" << msg << ". 
" << emsg); BOOST_REQUIRE_MESSAGE(CountNans(outBuf) == crowOut * 2 * n, "out" << msgNotNan); } } diff --git a/Tests/UnitTests/MathTests/MathTests.vcxproj b/Tests/UnitTests/MathTests/MathTests.vcxproj index 380623405ea6..8bb1a84c5ef0 100644 --- a/Tests/UnitTests/MathTests/MathTests.vcxproj +++ b/Tests/UnitTests/MathTests/MathTests.vcxproj @@ -170,7 +170,7 @@ - $(OutDir)..\cudnn64_4.dll + $(OutDir)..\cudnn64_5.dll diff --git a/Tests/UnitTests/V2LibraryTests/Image.h b/Tests/UnitTests/V2LibraryTests/Image.h index 26fb29dbdc92..a2c164e1c066 100644 --- a/Tests/UnitTests/V2LibraryTests/Image.h +++ b/Tests/UnitTests/V2LibraryTests/Image.h @@ -28,12 +28,12 @@ inline FunctionPtr ProjLayer(Variable wProj, Variable input, size_t hStride, siz auto b = Parameter({ outFeatureMapCount }, (float)bValue, device); auto sc = Parameter({ outFeatureMapCount }, (float)scValue, device); auto m = Constant({ outFeatureMapCount }, 0.0f, device); - auto isd = Constant({ outFeatureMapCount }, 0.0f, device); + auto var = Constant({ outFeatureMapCount }, 0.0f, device); size_t numInputChannels = input.Shape()[input.Shape().NumAxes() - 1]; auto c = Convolution(wProj, input, { hStride, vStride, numInputChannels }, { true }, { false }); - return BatchNormalization(c, sc, b, m, isd, true /*spatial*/, (double)bnTimeConst); + return BatchNormalization(c, sc, b, m, var, true /*spatial*/, (double)bnTimeConst); } inline FunctionPtr ResNetNode2(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device) diff --git a/Tools/docker/CNTK-GPU-Image/Dockerfile b/Tools/docker/CNTK-GPU-Image/Dockerfile index 590d2e5bb30c..37afce586b99 100644 --- a/Tools/docker/CNTK-GPU-Image/Dockerfile +++ b/Tools/docker/CNTK-GPU-Image/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:7.5-cudnn4-devel +FROM nvidia/cuda:7.5-cudnn5-devel RUN apt-get update && apt-get install -y --no-install-recommends \ autotools-dev \ diff --git a/configure b/configure index b6101ebe3729..3cb8645691d6 100755 --- a/configure +++ b/configure @@ -91,7 +91,7 @@ default_kaldis="kaldi-trunk kaldi-c024e8aa" default_gdk_includes="include/nvidia/gdk" default_gdk_nvml_libs="src/gdk/nvml/lib" default_cubs="cub-1.4.1" -default_cudnns="cudnn-4.0" +default_cudnns="cudnn-5.0 cudnn-5.1" default_opencvs="opencv-3.1.0 opencv-3.0.0" default_libzips="libzip-1.1.2"