merged with master
frankseide committed Apr 3, 2016
2 parents 531f553 + 916497b commit 7784350
Showing 385 changed files with 102,405 additions and 4,934,870 deletions.
10 changes: 5 additions & 5 deletions Documentation/CNTK-TechReport/lyx/CNTKBook_CN_Chapter.lyx
@@ -8333,9 +8333,9 @@ SquareError
 \begin_layout Standard
 \begin_inset Formula
 \begin{eqnarray}
-v\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \frac{1}{2}\mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
-\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\nabla_{n}^{J}\left(\mathbf{X}-\mathbf{Y}\right)\\
-\nabla_{\mathbf{Y}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}-\nabla_{n}^{J}\left(\mathbf{X}-\mathbf{Y}\right).
+v\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
+\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+2\nabla_{n}^{J}\left(\mathbf{X}-\mathbf{Y}\right)\\
+\nabla_{\mathbf{Y}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}-2\nabla_{n}^{J}\left(\mathbf{X}-\mathbf{Y}\right).
 \end{eqnarray}

 \end_inset
@@ -8367,8 +8367,8 @@ Note that
 \color none
 \begin_inset Formula
 \begin{eqnarray}
-\frac{\partial v}{\partial\mathbf{X}} & = & \mathbf{X}-\mathbf{Y}\\
-\frac{\partial v}{\partial\mathbf{Y}} & = & -\left(\mathbf{X}-\mathbf{Y}\right).
+\frac{\partial v}{\partial\mathbf{X}} & = & +2\left(\mathbf{X}-\mathbf{Y}\right)\\
+\frac{\partial v}{\partial\mathbf{Y}} & = & -2\left(\mathbf{X}-\mathbf{Y}\right).
 \end{eqnarray}

 \end_inset
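The corrected factor of 2 is easy to verify numerically: for v(X, Y) = Tr((X - Y)(X - Y)^T), the derivative with respect to X is 2(X - Y), matching the updated gradient lines above. A minimal NumPy check (illustrative only, not part of the commit):

    import numpy as np

    # v(X, Y) = Tr((X - Y)(X - Y)^T); the updated report states dv/dX = +2(X - Y).
    def v(X, Y):
        D = X - Y
        return np.trace(D @ D.T)

    rng = np.random.default_rng(0)
    X = rng.standard_normal((3, 4))
    Y = rng.standard_normal((3, 4))

    # central-difference numerical gradient with respect to X
    eps = 1e-6
    num = np.zeros_like(X)
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            E = np.zeros_like(X)
            E[i, j] = eps
            num[i, j] = (v(X + E, Y) - v(X - E, Y)) / (2 * eps)

    assert np.allclose(num, 2 * (X - Y), atol=1e-4)  # matches the corrected factor of 2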
@@ -1,4 +1,4 @@
-import urllib
+import urllib.request
 import gzip
 import os
 import struct
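The one-line change above ports the data-download script to Python 3, where the Python 2 urllib download helpers live in urllib.request. A hedged sketch of the Python 3 pattern the script presumably follows (the URL is illustrative, not taken from the commit):

    import gzip
    import urllib.request

    # Python 3: urllib.request replaces the Python 2 urllib module functions.
    url = "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"  # illustrative MNIST URL
    with urllib.request.urlopen(url) as response:
        data = gzip.decompress(response.read())
    print(len(data))  # raw IDX bytes, ready for struct-based parsing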
1 change: 0 additions & 1 deletion Examples/Image/MNIST/Config/01_OneHidden.cntk
@@ -13,7 +13,6 @@ deviceId = 0
 imageLayout = "cudnn"
 # override the above as follows when running on CPU:
 # deviceId = -1
-# imageLayout = "legacy"

 command = MNISTtrain:MNISTtest

1 change: 1 addition & 0 deletions Examples/Image/MNIST/Config/01_OneHidden.ndl
@@ -25,6 +25,7 @@ DNN = [
 err = ErrorPrediction(labels, ol)

 # Special Nodes
+errTop5 = ErrorPrediction(labels, ol, Const(1), tag="eval")
 FeatureNodes = (features)
 LabelNodes = (labels)
 CriterionNodes = (ce)
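The new errTop5 node illustrates ErrorPrediction's optional rank argument: with an extra Const(N), a sample counts as correct when the true label appears among the N highest-scoring outputs. A rough NumPy equivalent of that metric (an illustrative sketch, not CNTK code):

    import numpy as np

    def top_n_error(scores, labels, n):
        """Fraction of rows whose true label is not among the n largest scores."""
        top_n = np.argsort(scores, axis=1)[:, -n:]      # indices of the n best classes
        hit = (top_n == labels[:, None]).any(axis=1)    # true label among them?
        return 1.0 - hit.mean()

    scores = np.array([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]])
    labels = np.array([1, 2])
    print(top_n_error(scores, labels, 1))  # 0.5: second sample's top-1 guess is wrong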
3 changes: 1 addition & 2 deletions Examples/Image/MNIST/Config/02_Convolution.cntk
@@ -13,7 +13,6 @@ deviceId = 0
 imageLayout = "cudnn"
 # override the above as follows when running on CPU:
 # deviceId = -1
-# imageLayout = "legacy"

 command = train:test

@@ -42,7 +41,7 @@ train = [
 SGD = [
 epochSize = 60000
 minibatchSize = 32
-learningRatesPerMB = 0.5
+learningRatesPerMB = 0.1*5:0.3
 momentumPerMB = 0*10:0.7
 maxEpochs = 15
 ]
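The new learning-rate value uses CNTK's schedule syntax: 0.1*5:0.3 means 0.1 for the first five epochs, then 0.3 for the rest, and momentumPerMB = 0*10:0.7 reads the same way. A hedged Python sketch of how such a string expands to per-epoch values (an illustrative parser, not CNTK's implementation):

    def expand_schedule(spec, num_epochs):
        """Expand 'v*n:v2:...' into per-epoch values; the last value persists."""
        values = []
        for part in spec.split(":"):
            v, _, n = part.partition("*")
            values.extend([float(v)] * (int(n) if n else 1))
        values += [values[-1]] * max(0, num_epochs - len(values))
        return values[:num_epochs]

    print(expand_schedule("0.1*5:0.3", 15))  # [0.1]*5 + [0.3]*10
    print(expand_schedule("0*10:0.7", 15))   # [0.0]*10 + [0.7]*5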
24 changes: 13 additions & 11 deletions Examples/Image/MNIST/Config/02_Convolution.ndl
@@ -23,36 +23,38 @@ DNN=[
 hStride1 = 1
 vStride1 = 1
 # weight[cMap1, kW1 * kH1 * inputChannels]
-# ConvReLULayer is defined in Macros.ndl
-conv1_act = ConvReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1)
+# Conv2DReLULayer is defined in Macros.ndl
+conv1 = Conv2DReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1)

 # pool1
 pool1W = 2
 pool1H = 2
 pool1hStride = 2
 pool1vStride = 2
-pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)
+# MaxPooling is a standard NDL node.
+pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)

 # conv2
 kW2 = 5
 kH2 = 5
 cMap2 = 32
 hStride2 = 1
 vStride2 = 1
 # weight[cMap2, kW2 * kH2 * cMap1]
-# ConvReLULayer is defined in Macros.ndl
-conv2_act = ConvReLULayer(pool1, cMap2, 400, kW2, kH2, hStride2, vStride2, 10, 1)
+# ConvNDReLULayer is defined in Macros.ndl
+conv2 = ConvNDReLULayer(pool1, kW2, kH2, cMap1, 400, cMap2, hStride2, vStride2, 10, 1)

 # pool2
 pool2W = 2
 pool2H = 2
 pool2hStride = 2
 pool2vStride = 2
-pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=$imageLayout$)
+# MaxNDPooling is defined in Macros.ndl
+pool2 = MaxNDPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=$imageLayout$)

 h1Dim = 128
-h1 = DNNImageSigmoidLayer(4, 4, cMap2, h1Dim, pool2, 1)
+# DNNImageSigmoidLayer and DNNLayer are defined in Macros.ndl
+h1 = DNNImageSigmoidLayer(7, 7, cMap2, h1Dim, pool2, 1)
 ol = DNNLayer(h1Dim, labelDim, h1, 1)

 ce = CrossEntropyWithSoftmax(labels, ol)
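The change from DNNImageSigmoidLayer(4, 4, ...) to (7, 7, ...) follows from the padding switch in the new macros: the old ConvReLULayer convolved with zeroPadding=false, shrinking 28 -> 24 -> 12 -> 8 -> 4 across the two conv/pool stages, while Conv2D and ConvND pad the spatial dimensions so each convolution preserves size, giving 28 -> 28 -> 14 -> 14 -> 7. The arithmetic, as a quick check:

    def conv_out(size, kernel, stride=1, pad=0):
        return (size + 2 * pad - kernel) // stride + 1

    # old path: 5x5 convs, zeroPadding=false, 2x2 max-pool with stride 2
    s = conv_out(28, 5)      # 24
    s = s // 2               # 12
    s = conv_out(s, 5)       # 8
    s = s // 2               # 4  -> old DNNImageSigmoidLayer(4, 4, ...)

    # new path: same convs with "same"-style padding (pad=2 for a 5x5 kernel)
    t = conv_out(28, 5, pad=2)  # 28
    t = t // 2                  # 14
    t = conv_out(t, 5, pad=2)   # 14
    t = t // 2                  # 7  -> new DNNImageSigmoidLayer(7, 7, ...)
    print(s, t)                 # 4 7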
22 changes: 6 additions & 16 deletions Examples/Image/MNIST/Config/03_ConvBatchNorm.cntk
@@ -13,9 +13,8 @@ deviceId = 0
 imageLayout = "cudnn"
 # override the above as follows when running on CPU:
 # deviceId = -1
-# imageLayout = "legacy"

-command = train:CreateEvalModel:test
+command = train:test

 precision = "float"
 modelPath = "$ModelDir$/03_ConvBatchNorm"
@@ -38,9 +37,11 @@ train = [
 SGD = [
 epochSize = 60000
 minibatchSize = 32
-learningRatesPerMB = 0.5
-momentumPerMB = 0*10:0.7
+learningRatesPerMB = 0.5:0.1
+momentumPerMB = 0.9
 maxEpochs = 2
+#batchNormalizationTimeConstant=0 # Set through NDL
+batchNormalizationBlendTimeConstant=0:1#INF
 ]

 reader = [
@@ -63,17 +64,6 @@ train = [
 ]
 ]

-#######################################
-# Edit model #
-#######################################
-
-CreateEvalModel=[
-action=edit
-CurModel=$ModelDir$/03_ConvBatchNorm
-NewModel=$ModelDir$/03_ConvBatchNorm.Eval
-editPath=$ConfigDir$/03_ConvBatchNorm.mel
-]
-
 #######################################
 # TEST CONFIG #
 #######################################
@@ -82,7 +72,7 @@ test = [
 action = "test"
 minibatchSize = 32

-modelPath=$ModelDir$/03_ConvBatchNorm.Eval
+modelPath=$ModelDir$/03_ConvBatchNorm

 NDLNetworkBuilder = [
 networkDescription = "$ConfigDir$/03_ConvBatchNorm.ndl"
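Dropping the CreateEvalModel edit step is enabled by the new blend-time-constant setting: it controls from the config how strongly the batch-normalization nodes lean on running statistics instead of minibatch statistics (0 means pure minibatch statistics; 1#INF, i.e. infinity, means pure running statistics, which is the evaluation behavior the edited .Eval model used to provide). A hedged sketch of the blending idea; the weighting formula here is an assumption, not CNTK's actual code:

    import math

    def blended_stats(batch_stat, running_stat, blend_time_const):
        """Mix minibatch and running statistics; blend=0 -> batch only,
        blend=inf -> running only (evaluation behavior)."""
        if math.isinf(blend_time_const):
            return running_stat
        w = blend_time_const / (blend_time_const + 1.0)  # assumed weighting
        return w * running_stat + (1.0 - w) * batch_stat

    print(blended_stats(0.2, 0.5, 0.0))        # 0.2: start of training, batch stats
    print(blended_stats(0.2, 0.5, math.inf))   # 0.5: evaluation, running stats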
6 changes: 0 additions & 6 deletions Examples/Image/MNIST/Config/03_ConvBatchNorm.mel

This file was deleted.

2 changes: 1 addition & 1 deletion Examples/Image/MNIST/Config/03_ConvBatchNorm.ndl
@@ -15,7 +15,7 @@ ndlMnistMacros = [
 labels = InputValue(labelDim)

 scValue = 1
-# Batch normalization time constant.
+# Batch normalization time constant (normalizationTimeConstant). blendTimeConstant is set through .cntk file.
 bnTimeConst = 1024

 convWScale = 10
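The "time constant" in this comment expresses the smoothing of the running mean and variance in units of samples rather than as a raw decay factor. A common reading (an assumption here; the commit does not spell out the formula) is that a minibatch of N samples keeps the old estimate with weight exp(-N / timeConst):

    import math

    def update_running_estimate(old, batch_value, mb_size, time_const):
        keep = math.exp(-mb_size / time_const) if time_const > 0 else 0.0
        return keep * old + (1.0 - keep) * batch_value

    # bnTimeConst = 1024 from the .ndl above, minibatchSize = 32 from the .cntk file:
    print(update_running_estimate(0.0, 1.0, 32, 1024))  # ~0.031 of the new batch per step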
56 changes: 42 additions & 14 deletions Examples/Image/MNIST/Config/Macros.ndl
@@ -1,28 +1,28 @@
 DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
-W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale)
-b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
+W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
+b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
 t = Times(W, x)
 z = Plus(t, b)
 y = Sigmoid(z)
 ]

 DNNImageSigmoidLayer(inW, inH, inC, outDim, x, parmScale) = [
-W = ImageParameter(outDim, inW, inH, inC, init="uniform", initValueScale=parmScale, imageLayout=$imageLayout$)
-b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
+W = ImageParameter(outDim, inW, inH, inC, init="uniform", initValueScale=parmScale, initOnCPUOnly=true, imageLayout=$imageLayout$)
+b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
 t = Times(W, x)
 z = Plus(t, b)
 y = Sigmoid(z)
 ]

 DNNLayer(inDim, outDim, x, parmScale) = [
-W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale)
-b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
+W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
+b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
 t = Times(W, x)
 z = Plus(t, b)
 ]

 DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
-W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
+W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale, initOnCPUOnly=true)
 b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
 sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
 m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
@@ -32,12 +32,36 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
 y = RectifiedLinear(bn)
 ]

-ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
-convW = LearnableParameter(outMap, inWCount, init="uniform", initValueScale=wScale)
-convB = ImageParameter(1, 1, outMap, init="fixedValue", value=bValue, imageLayout=$imageLayout$)
-conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=$imageLayout$)
-convPlusB = Plus(conv, convB);
-act = RectifiedLinear(convPlusB);
-]
+ConvW(outMap, inWCount, wScale) = [
+W = LearnableParameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=true)
+]

+ConvB(outMap, bValue) = [
+b = ImageParameter(1, 1, outMap, init="fixedValue", value=bValue, imageLayout=$imageLayout$)
+]
+
+Conv2D(w, inp, kW, kH, outMap, hStride, vStride) = [
+c = Convolution(w, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
+]
+
+ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride) = [
+c = Convolution(w, inp, {kW, kH, inMap}, mapCount=outMap, stride={hStride, vStride, inMap}, sharing={true, true, true}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
+]
+
+Conv2DReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
+w = ConvW(outMap, inWCount, wScale)
+b = ConvB(outMap, bValue)
+c = Conv2D(w, inp, kW, kH, outMap, hStride, vStride)
+cpb = Plus(c, b);
+out = RectifiedLinear(cpb);
+]
+
+ConvNDReLULayer(inp, kW, kH, inMap, inWCount, outMap, hStride, vStride, wScale, bValue) = [
+w = ConvW(outMap, inWCount, wScale)
+b = ConvB(outMap, bValue)
+c = ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride)
+cpb = Plus(c, b);
+out = RectifiedLinear(cpb);
+]

 ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) = [
@@ -51,11 +75,15 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
 ]

 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
-W = LearnableParameter(outMap, inWCount, init=Gaussian, initValueScale=wScale)
+W = LearnableParameter(outMap, inWCount, init=Gaussian, initValueScale=wScale, initOnCPUOnly=true)
 c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
 ]

 ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
 c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
 y = RectifiedLinear(c)
 ]
+
+MaxNDPooling(inp, kW, kH, hStride, vStride) = [
+p = Pooling(inp, "max", {kW, kH, 1}, stride={hStride, vStride, 1}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
+]
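The new ConvND and MaxNDPooling macros treat the input as a (width, height, channels) volume: the kernel spans {kW, kH, inMap}, the stride {hStride, vStride, inMap} consumes the channel axis in a single step, and autoPadding={true, true, false} pads only the two spatial axes. Under that reading, output sizes follow "same"-convolution arithmetic (a sketch, assuming ceil division for the padded strided dimensions):

    import math

    def ndconv_output_shape(in_w, in_h, in_c, h_stride, v_stride, out_maps):
        # spatial axes: auto-padded, so out = ceil(in / stride);
        # channel axis: kernel depth == stride == in_c, unpadded -> collapses to 1,
        # replicated out_maps times by mapCount.
        return (math.ceil(in_w / h_stride), math.ceil(in_h / v_stride), out_maps)

    print(ndconv_output_shape(14, 14, 16, 1, 1, 32))  # (14, 14, 32), as for conv2 above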
2 changes: 1 addition & 1 deletion Examples/Image/MNIST/README.md
@@ -70,7 +70,7 @@ To run the sample, navigate to the Data folder and run the following command:

 3. 03_ConvBatchNorm.ndl is almost identical to 02_Convolution.ndl
 except that it uses batch normalization for the convolutional and fully connected layers.
-As a result, it achieves around 0.92% of error after training for just 2 epochs (and less than 30 seconds).
+As a result, it achieves around 0.8% of error after training for just 2 epochs (and less than 30 seconds).
 To run the sample, navigate to the Data folder and run the following command:
 `cntk configFile=../Config/03_ConvBatchNorm.cntk`
2 changes: 1 addition & 1 deletion Examples/Image/Miscellaneous/CIFAR-10/01_Conv.cntk
@@ -12,7 +12,6 @@ deviceId = 0
 imageLayout = "cudnn"
 # override the above as follows when running on CPU:
 # deviceId = -1
-# imageLayout = "legacy"

 prefetch = "true"

@@ -45,6 +44,7 @@ Train = [
 readerType = "UCIFastReader"
 file = "$DataDir$/Train.txt"
 randomize = "auto"
+minibatchMode="full"
 features = [
 dim = 3072
 start = 1
13 changes: 3 additions & 10 deletions Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.cntk
@@ -12,11 +12,10 @@ deviceId = 0
 imageLayout = "cudnn"
 # override the above as follows when running on CPU:
 # deviceId = -1
-# imageLayout = "legacy"

 prefetch = "true"

-command = Train:AddBNEval:Test
+command = Train:Test

 stderr = "$OutputDir$/02_BatchNormConv"
 traceLevel = 1
@@ -44,6 +43,7 @@ Train = [
 readerType = "UCIFastReader"
 file = "$DataDir$/Train.txt"
 randomize = "auto"
+minibatchMode="full"
 features = [
 dim = 3072
 start = 1
@@ -57,16 +57,9 @@ Train = [
 ]
 ]

-AddBNEval = [
-action = "edit"
-CurModel = "$ModelDir$/02_BatchNormConv"
-NewModel = "$ModelDir$/02_BatchNormConv.Eval"
-editPath = "$ConfigDir$/02_BatchNormConv.mel"
-]
-
 Test = [
 action = "test"
-modelPath = "$ModelDir$/02_BatchNormConv.Eval"
+modelPath = "$ModelDir$/02_BatchNormConv"
 # Set minibatch size for testing.
 minibatchSize = 16
6 changes: 0 additions & 6 deletions Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel

This file was deleted.

12 changes: 2 additions & 10 deletions Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.cntk
@@ -12,12 +12,11 @@ deviceId = 0
 imageLayout = "cudnn"
 # override the above as follows when running on CPU:
 # deviceId = -1
-# imageLayout = "legacy"

 prefetch = "true"
 parallelTrain = "false"

-command = Train:AddBNEval:Test
+command = Train:Test

 stderr = "$OutputDir$/03_ResNet"
 traceLevel = 1
@@ -75,16 +74,9 @@ Train = [
 ]
 ]

-AddBNEval = [
-action = "edit"
-CurModel = "$ModelDir$/03_ResNet"
-NewModel = "$ModelDir$/03_ResNet.Eval"
-editPath = "$ConfigDir$/03_ResNet.mel"
-]
-
 Test = [
 action = "test"
-modelPath = "$ModelDir$/03_ResNet.Eval"
+modelPath = "$ModelDir$/03_ResNet"
 # Set minibatch size for testing.
 minibatchSize = 512
6 changes: 0 additions & 6 deletions Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel

This file was deleted.
