From e389459ca65517b928e0a6eddbd3bd3f906780a5 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 20 Jan 2016 23:28:33 -0800
Subject: [PATCH] LearnableParameter::InitRandom() now mimics the Matrix
 initialization behavior; bug fix: LookupTableNode used GetAsMatrixNumRows()
 on input[1] which is a minibatch; bug fix: Image/QuickE2E network definition
 updated to drop the now unnecessary extra column dimension; bug fix:
 TensorShape::IsDense() should not require m_offset to be 0 (column slices
 are perfectly fine)

---
 Source/Common/Include/TensorShape.h                |  2 --
 Source/ComputationNetworkLib/ComputationNode.h     |  2 +-
 .../ComputationNetworkLib/InputAndParamNodes.h     | 17 +++++++++++------
 Tests/EndToEndTests/Image/QuickE2E/cntk.config     |  4 ++--
 Tests/EndToEndTests/Speech/LSTM/cntk.config        | 17 +++++++++--------
 5 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/Source/Common/Include/TensorShape.h b/Source/Common/Include/TensorShape.h
index 8d7bc6330e38..3bd3119fce71 100644
--- a/Source/Common/Include/TensorShape.h
+++ b/Source/Common/Include/TensorShape.h
@@ -388,8 +388,6 @@ struct TensorShape
     // verify that this refers to a dense matrix (no strides)
     void VerifyIsDense() const
     {
-        if (m_offset != 0)
-            LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int) m_offset);
         for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here)
         {
             ptrdiff_t stride = k > 0 ? m_strides[k - 1] * (ptrdiff_t) m_dims[k - 1] : 1;
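
Note: density is a property of the strides alone, which is why the m_offset
check can go. A tensor is dense iff every dimension's stride equals the
product of all lower dimensions, exactly what the surviving loop verifies;
m_offset only moves the starting element and says nothing about gaps between
elements. A column slice of a column-major matrix keeps the full column
stride and only bumps the offset, so it is still one contiguous block. A
minimal standalone sketch of that stride check (toy helper, not CNTK's
actual TensorShape API):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // dense iff stride of dim k == product of dims 0..k-1 (column-major, unit inner stride)
    static bool IsDense(const std::vector<size_t>& dims, const std::vector<ptrdiff_t>& strides)
    {
        for (size_t k = 0; k < dims.size(); k++)
        {
            ptrdiff_t expected = k > 0 ? strides[k - 1] * (ptrdiff_t) dims[k - 1] : 1;
            if (strides[k] != expected)
                return false;
        }
        return true;
    }

    int main()
    {
        assert(IsDense({5, 4}, {1, 5}));  // full 5x4 matrix
        assert(IsDense({5, 2}, {1, 5}));  // columns 2..3: offset moves, strides unchanged -> dense
        assert(!IsDense({2, 4}, {1, 5})); // row slice: stride gap between columns -> not dense
    }
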
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index e54c0f34e635..8d4ee3bbba8e 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -393,7 +393,7 @@ class ComputationNodeBase : public IComputationNode,
         if (HasMBLayout())
             LogicError("CheckTensorIsMatrix: Minibatch data cannot be interpreted as a single 2D tensor.");
         else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
-            LogicError("CheckTensorIsMatrix: Sample is now a 2D tensor.");
+            LogicError("CheckTensorIsMatrix: Sample is not a column vector or matrix (1D or 2D tensor).");
     }
 public:
     size_t GetAsMatrixNumRows() const
diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h
index 8643769011dc..ae1c94783a1b 100644
--- a/Source/ComputationNetworkLib/InputAndParamNodes.h
+++ b/Source/ComputationNetworkLib/InputAndParamNodes.h
@@ -130,21 +130,26 @@ class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
         // the random seed offset is set via the "randomSeedOffset" parameter in config

         if (initOnCPUOnly)
-            m_value->TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true);
+            Value().TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true);
+#if 1 // this more complex version is needed to repro test cases generated with an older version
+        auto value = GetSampleLayout().GetRank() > 2 ? Value() : ValueAsMatrix();
+#else
+        auto value = Value();
+#endif
         if (uniformInit)
         {
             // TODO: move these hidden extra factors out from here and into NDL, and make them visible in BS
             ElemType randRange = 0.05f * initValueScale;
-            Value().SetUniformRandomValue(-randRange, randRange, randomSeed);
+            value.SetUniformRandomValue(-randRange, randRange, randomSeed);
         }
         else
        {
             size_t inputSize = GetAsMatrixNumCols();
             ElemType randInitstd = 0.2f * initValueScale / sqrt(ElemType(inputSize));
-            Value().SetGaussianRandomValue(0, randInitstd, randomSeed);
+            value.SetGaussianRandomValue(0, randInitstd, randomSeed);
         }

         if (initOnCPUOnly)
-            m_value->TransferToDeviceIfNotThereAndNotAutoPlace(m_deviceId, true);
+            Value().TransferToDeviceIfNotThereAndNotAutoPlace(m_deviceId, true);
     }

     // initialize by reading a matrix from a text file
@@ -492,10 +497,10 @@ class LookupTableNode : public ComputationNode<ElemType>, public NumInputs<2>
         if (isFinalValidationPass && !HasMBLayout())
             InvalidArgument("%ls %ls operation can only operate on minibatches.", NodeName().c_str(), OperationName().c_str());

-        if (isFinalValidationPass && Input(1)->GetAsMatrixNumRows() % Input(0)->GetAsMatrixNumCols() != 0)
+        if (isFinalValidationPass && Input(1)->GetSampleMatrixNumRows() % Input(0)->GetAsMatrixNumCols() != 0)
             InvalidArgument("Mismatched dimension. Rows in input1 must be multiples of cols in input0.");

-        int wordsInEachSample = Input(1)->GetAsMatrixNumRows() / Input(0)->GetAsMatrixNumCols();
+        size_t wordsInEachSample = Input(1)->GetSampleMatrixNumRows() / Input(0)->GetAsMatrixNumCols();

         // TODO: Should this add a tensor dimension?
         SetDims(TensorShape(Input(0)->GetSampleMatrixNumRows() * wordsInEachSample), true);
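
Note on the LookupTableNode fix: input[1] carries an MBLayout, so asking for
its "as matrix" row count is exactly the case that CheckTensorIsMatrix above
rejects ("Minibatch data cannot be interpreted as a single 2D tensor"); the
per-sample row count is what the validation actually means. A toy restatement
of the corrected dimension math (standalone sketch with made-up numbers, not
the CNTK API):

    #include <cstddef>
    #include <cstdio>
    #include <stdexcept>

    int main()
    {
        size_t embedDim   = 50;   // rows of input0, the lookup weight matrix
        size_t vocabSize  = 1000; // cols of input0
        size_t sampleRows = 3000; // per-sample rows of input1 (GetSampleMatrixNumRows)

        // corrected check: per-sample rows must tile into blocks of vocabSize
        if (sampleRows % vocabSize != 0)
            throw std::runtime_error("Mismatched dimension. Rows in input1 must be multiples of cols in input0.");

        size_t wordsInEachSample = sampleRows / vocabSize; // 3 words per sample
        std::printf("output rows per sample = %zu\n",
                    embedDim * wordsInEachSample);         // 50 * 3 = 150
    }
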
diff --git a/Tests/EndToEndTests/Image/QuickE2E/cntk.config b/Tests/EndToEndTests/Image/QuickE2E/cntk.config
index eb49229c9be8..470b895a1585 100644
--- a/Tests/EndToEndTests/Image/QuickE2E/cntk.config
+++ b/Tests/EndToEndTests/Image/QuickE2E/cntk.config
@@ -33,8 +33,8 @@ train = [
         convW = Parameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=false)
         conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=if useCuDnn then "cudnn" else "legacy")
         convB = if useCuDnn
-                then ParameterTensor((1 : 1 : outMap : 1/*col dim*/), init="fixedValue", value=bValue)
-                else Parameter(outMap, 1, init="fixedValue", value=bValue)
+                then ParameterTensor((1 : 1 : outMap), init="fixedValue", value=bValue)
+                else Parameter(outMap, 1, init="fixedValue", value=bValue)
         convPlusB = Plus(conv, convB);
         out = RectifiedLinear(convPlusB);
     ]
diff --git a/Tests/EndToEndTests/Speech/LSTM/cntk.config b/Tests/EndToEndTests/Speech/LSTM/cntk.config
index de01d3d793cd..f8600809ab30 100644
--- a/Tests/EndToEndTests/Speech/LSTM/cntk.config
+++ b/Tests/EndToEndTests/Speech/LSTM/cntk.config
@@ -49,6 +49,7 @@ speechTrain = [
     ExperimentalNetworkBuilder=[

         WeightParam(m,n) = Parameter(m, n, init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
+        #BiasParam(m) = ParameterTensor(m, init='fixedValue', value=0.0)
         BiasParam(m) = Parameter(m, 1, init='fixedValue', value=0.0)
         ScalarParam() = Parameter(1, 1, init='fixedValue', value=0.0)

@@ -70,19 +71,19 @@ speechTrain = [
         PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1)
         PastValue1 = PastValue
         #PastValue1 = PastValueShift
-        dh = PastValue1(outputDim, output); // hidden state(t-1)
-        dc = PastValue1(cellDim, ct); // cell(t-1)
+        dh = PastValue1(outputDim, output);               // hidden state(t-1)
+        dc = PastValue1(cellDim, ct);                     // cell(t-1)

         // note: the W(inputx) here are all different, they all come with their own set of weights; same for H(dh), C(dc), and B()
-        it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)
-        bit = it .* Tanh(W(inputx) + (H(dh) + B())) // applied to tanh of input network
+        it = Sigmoid(W(inputx) + B() + H(dh) + C(dc))     // input gate(t)
+        bit = it .* Tanh(W(inputx) + (H(dh) + B()))       // applied to tanh of input network

-        ft = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // forget-me-not gate(t)
+        ft = Sigmoid(W(inputx) + B() + H(dh) + C(dc))     // forget-me-not gate(t)
         bft = ft .* dc // applied to cell(t-1)

         ct = bft + bit // c(t) is sum of both

-        ot = Sigmoid(W(inputx) + B() + H(dh) + C(ct)) // output gate(t)
+        ot = Sigmoid(W(inputx) + B() + H(dh) + C(ct))     // output gate(t)
         mt = ot .* Tanh(ct) // applied to tanh(cell(t))

         output = Wmr * Stabilize(mt) // projection
@@ -90,7 +91,7 @@ speechTrain = [

         // define basic I/O
         baseFeatDim = 33
-        featDim = 11 * baseFeatDim // TODO: 363--is this the correct explanation?
+        featDim = 11 * baseFeatDim
         labelDim = 132

         // hidden dimensions
@@ -101,7 +102,7 @@ speechTrain = [
         // features
         features = Input(featDim, tag='feature')
         labels = Input(labelDim, tag='label')
-        feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); # shift 5 frames right (x_{t+5} -> x_{t} ) // TODO why 5? Where do I see this?
+        feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); # shift 5 frames right (x_{t+5} -> x_{t} ) // TODO why 5? Where do I see this? Seems to be the last frame!

         featNorm = MeanVarNorm(feashift)
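
Note, for reference: the cell that the BrainScript above wires up is a
peephole LSTM with a projected, stabilized output, where h_{t-1} =
PastValue(output) and c_{t-1} = PastValue(ct). In conventional notation (the
per-gate subscripts below are only notational; as the comment in the script
says, each use of W/H/C/B instantiates its own weights):

    i_t = \sigma(W_i x_t + H_i h_{t-1} + C_i c_{t-1} + b_i)     // it
    g_t = \tanh(W_g x_t + H_g h_{t-1} + b_g)                    // tanh of input network
    f_t = \sigma(W_f x_t + H_f h_{t-1} + C_f c_{t-1} + b_f)     // ft
    c_t = f_t \odot c_{t-1} + i_t \odot g_t                     // ct = bft + bit
    o_t = \sigma(W_o x_t + H_o h_{t-1} + C_o c_t + b_o)         // ot, peephole from c_t
    h_t = W_{mr} Stabilize(o_t \odot \tanh(c_t))                // output = projection of mt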