From e389459ca65517b928e0a6eddbd3bd3f906780a5 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 20 Jan 2016 23:28:33 -0800
Subject: [PATCH] LearnableParameter::InitRandom() now mimics the Matrix
 initialization behavior; bug fix: LookupTableNode used GetAsMatrixNumRows()
 on input[1] which is a minibatch; bug fix: Image/QuickE2E network definition
 updated to drop the now unnecessary extra column dimension; bug fix:
 TensorShape::IsDense() should not require m_offset to be 0 (column slices
 are perfectly fine)

---
 Source/Common/Include/TensorShape.h                |  2 --
 Source/ComputationNetworkLib/ComputationNode.h     |  2 +-
 .../ComputationNetworkLib/InputAndParamNodes.h     | 17 +++++++++++------
 Tests/EndToEndTests/Image/QuickE2E/cntk.config     |  4 ++--
 Tests/EndToEndTests/Speech/LSTM/cntk.config        | 17 +++++++++--------
 5 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/Source/Common/Include/TensorShape.h b/Source/Common/Include/TensorShape.h
index 8d7bc6330e38..3bd3119fce71 100644
--- a/Source/Common/Include/TensorShape.h
+++ b/Source/Common/Include/TensorShape.h
@@ -388,8 +388,6 @@ struct TensorShape
     // verify that this refers to a dense matrix (no strides)
     void VerifyIsDense() const
     {
-        if (m_offset != 0)
-            LogicError("TensorShape: A dense TensorShape expected. Offset %d not allowed.", (int) m_offset);
         for (size_t k = 0; k < m_dims.size(); k++) // (TODO: we can save one multiplication here)
         {
             ptrdiff_t stride = k > 0 ? m_strides[k - 1] * (ptrdiff_t) m_dims[k - 1] : 1;
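
Note: density is a property of the strides alone, which is why the m_offset
check can go. A tensor is dense iff every dimension's stride equals the
product of all lower dimensions, exactly what the surviving loop verifies;
m_offset only moves the starting element and says nothing about gaps between
elements. A column slice of a column-major matrix keeps the full column
stride and only bumps the offset, so it is still one contiguous block. A
minimal standalone sketch of that stride check (toy helper, not CNTK's
actual TensorShape API):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // dense iff stride of dim k == product of dims 0..k-1 (column-major, unit inner stride)
    static bool IsDense(const std::vector<size_t>& dims, const std::vector<ptrdiff_t>& strides)
    {
        for (size_t k = 0; k < dims.size(); k++)
        {
            ptrdiff_t expected = k > 0 ? strides[k - 1] * (ptrdiff_t) dims[k - 1] : 1;
            if (strides[k] != expected)
                return false;
        }
        return true;
    }

    int main()
    {
        assert(IsDense({5, 4}, {1, 5}));  // full 5x4 matrix
        assert(IsDense({5, 2}, {1, 5}));  // columns 2..3: offset moves, strides unchanged -> dense
        assert(!IsDense({2, 4}, {1, 5})); // row slice: stride gap between columns -> not dense
    }
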
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index e54c0f34e635..8d4ee3bbba8e 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -393,7 +393,7 @@ class ComputationNodeBase : public IComputationNode,
         if (HasMBLayout())
             LogicError("CheckTensorIsMatrix: Minibatch data cannot be interpreted as a single 2D tensor.");
         else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
-            LogicError("CheckTensorIsMatrix: Sample is now a 2D tensor.");
+            LogicError("CheckTensorIsMatrix: Sample is not a column vector or matrix (1D or 2D tensor).");
     }
 public:
     size_t GetAsMatrixNumRows() const
diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h
index 8643769011dc..ae1c94783a1b 100644
--- a/Source/ComputationNetworkLib/InputAndParamNodes.h
+++ b/Source/ComputationNetworkLib/InputAndParamNodes.h
@@ -130,21 +130,26 @@ class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
         // the random seed offset is set via the "randomSeedOffset" parameter in config

         if (initOnCPUOnly)
-            m_value->TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true);
+            Value().TransferToDeviceIfNotThereAndNotAutoPlace(CPUDEVICE, true);
+#if 1 // this more complex version is needed to repro test cases generated with an older version
+        auto value = GetSampleLayout().GetRank() > 2 ? Value() : ValueAsMatrix();
+#else
+        auto value = Value();
+#endif
         if (uniformInit)
         {
             // TODO: move these hidden extra factors out from here and into NDL, and make them visible in BS
             ElemType randRange = 0.05f * initValueScale;
-            Value().SetUniformRandomValue(-randRange, randRange, randomSeed);
+            value.SetUniformRandomValue(-randRange, randRange, randomSeed);
         }
         else
        {
             size_t inputSize = GetAsMatrixNumCols();
             ElemType randInitstd = 0.2f * initValueScale / sqrt(ElemType(inputSize));
-            Value().SetGaussianRandomValue(0, randInitstd, randomSeed);
+            value.SetGaussianRandomValue(0, randInitstd, randomSeed);
         }

         if (initOnCPUOnly)
-            m_value->TransferToDeviceIfNotThereAndNotAutoPlace(m_deviceId, true);
+            Value().TransferToDeviceIfNotThereAndNotAutoPlace(m_deviceId, true);
     }

     // initialize by reading a matrix from a text file
@@ -492,10 +497,10 @@ class LookupTableNode : public ComputationNode<ElemType>, public NumInputs<2>
         if (isFinalValidationPass && !HasMBLayout())
             InvalidArgument("%ls %ls operation can only operate on minibatches.", NodeName().c_str(), OperationName().c_str());

-        if (isFinalValidationPass && Input(1)->GetAsMatrixNumRows() % Input(0)->GetAsMatrixNumCols() != 0)
+        if (isFinalValidationPass && Input(1)->GetSampleMatrixNumRows() % Input(0)->GetAsMatrixNumCols() != 0)
             InvalidArgument("Mismatched dimension. Rows in input1 must be multiples of cols in input0.");

-        int wordsInEachSample = Input(1)->GetAsMatrixNumRows() / Input(0)->GetAsMatrixNumCols();
+        size_t wordsInEachSample = Input(1)->GetSampleMatrixNumRows() / Input(0)->GetAsMatrixNumCols();

         // TODO: Should this add a tensor dimension?
         SetDims(TensorShape(Input(0)->GetSampleMatrixNumRows() * wordsInEachSample), true);
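
Note on the LookupTableNode fix: input[1] carries an MBLayout, so asking for
its "as matrix" row count is exactly the case that CheckTensorIsMatrix above
rejects ("Minibatch data cannot be interpreted as a single 2D tensor"); the
per-sample row count is what the validation actually means. A toy restatement
of the corrected dimension math (standalone sketch with made-up numbers, not
the CNTK API):

    #include <cstddef>
    #include <cstdio>
    #include <stdexcept>

    int main()
    {
        size_t embedDim   = 50;   // rows of input0, the lookup weight matrix
        size_t vocabSize  = 1000; // cols of input0
        size_t sampleRows = 3000; // per-sample rows of input1 (GetSampleMatrixNumRows)

        // corrected check: per-sample rows must tile into blocks of vocabSize
        if (sampleRows % vocabSize != 0)
            throw std::runtime_error("Mismatched dimension. Rows in input1 must be multiples of cols in input0.");

        size_t wordsInEachSample = sampleRows / vocabSize; // 3 words per sample
        std::printf("output rows per sample = %zu\n",
                    embedDim * wordsInEachSample);         // 50 * 3 = 150
    }
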
diff --git a/Tests/EndToEndTests/Image/QuickE2E/cntk.config b/Tests/EndToEndTests/Image/QuickE2E/cntk.config
index eb49229c9be8..470b895a1585 100644
--- a/Tests/EndToEndTests/Image/QuickE2E/cntk.config
+++ b/Tests/EndToEndTests/Image/QuickE2E/cntk.config
@@ -33,8 +33,8 @@ train = [
         convW = Parameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=false)
         conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=if useCuDnn then "cudnn" else "legacy")
         convB = if useCuDnn
-                then ParameterTensor((1 : 1 : outMap : 1/*col dim*/), init="fixedValue", value=bValue)
-                else Parameter(outMap, 1, init="fixedValue", value=bValue)
+                then ParameterTensor((1 : 1 : outMap), init="fixedValue", value=bValue)
+                else Parameter(outMap, 1, init="fixedValue", value=bValue)
         convPlusB = Plus(conv, convB);
         out = RectifiedLinear(convPlusB);
     ]
diff --git a/Tests/EndToEndTests/Speech/LSTM/cntk.config b/Tests/EndToEndTests/Speech/LSTM/cntk.config
index de01d3d793cd..f8600809ab30 100644
--- a/Tests/EndToEndTests/Speech/LSTM/cntk.config
+++ b/Tests/EndToEndTests/Speech/LSTM/cntk.config
@@ -49,6 +49,7 @@ speechTrain = [
     ExperimentalNetworkBuilder=[

         WeightParam(m,n) = Parameter(m, n, init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
+        #BiasParam(m) = ParameterTensor(m, init='fixedValue', value=0.0)
         BiasParam(m) = Parameter(m, 1, init='fixedValue', value=0.0)
         ScalarParam() = Parameter(1, 1, init='fixedValue', value=0.0)

@@ -70,19 +71,19 @@ speechTrain = [
         PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1)
         PastValue1 = PastValue
         #PastValue1 = PastValueShift
-        dh = PastValue1(outputDim, output); // hidden state(t-1)
-        dc = PastValue1(cellDim, ct); // cell(t-1)
+        dh = PastValue1(outputDim, output);               // hidden state(t-1)
+        dc = PastValue1(cellDim, ct);                     // cell(t-1)

         // note: the W(inputx) here are all different, they all come with their own set of weights; same for H(dh), C(dc), and B()
-        it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)
-        bit = it .* Tanh(W(inputx) + (H(dh) + B())) // applied to tanh of input network
+        it = Sigmoid(W(inputx) + B() + H(dh) + C(dc))     // input gate(t)
+        bit = it .* Tanh(W(inputx) + (H(dh) + B()))       // applied to tanh of input network

-        ft = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // forget-me-not gate(t)
+        ft = Sigmoid(W(inputx) + B() + H(dh) + C(dc))     // forget-me-not gate(t)
         bft = ft .* dc // applied to cell(t-1)

         ct = bft + bit // c(t) is sum of both

-        ot = Sigmoid(W(inputx) + B() + H(dh) + C(ct)) // output gate(t)
+        ot = Sigmoid(W(inputx) + B() + H(dh) + C(ct))     // output gate(t)
         mt = ot .* Tanh(ct) // applied to tanh(cell(t))

         output = Wmr * Stabilize(mt) // projection
@@ -90,7 +91,7 @@ speechTrain = [

         // define basic I/O
         baseFeatDim = 33
-        featDim = 11 * baseFeatDim // TODO: 363--is this the correct explanation?
+        featDim = 11 * baseFeatDim
         labelDim = 132

         // hidden dimensions
@@ -101,7 +102,7 @@ speechTrain = [
         // features
         features = Input(featDim, tag='feature')
         labels = Input(labelDim, tag='label')
-        feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); # shift 5 frames right (x_{t+5} -> x_{t} ) // TODO why 5? Where do I see this?
+        feashift = RowSlice(featDim - baseFeatDim, baseFeatDim, features); # shift 5 frames right (x_{t+5} -> x_{t} ) // TODO why 5? Where do I see this? Seems to be the last frame!

         featNorm = MeanVarNorm(feashift)
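
Note, for reference: the cell that the BrainScript above wires up is a
peephole LSTM with a projected, stabilized output, where h_{t-1} =
PastValue(output) and c_{t-1} = PastValue(ct). In conventional notation (the
per-gate subscripts below are only notational; as the comment in the script
says, each use of W/H/C/B instantiates its own weights):

    i_t = \sigma(W_i x_t + H_i h_{t-1} + C_i c_{t-1} + b_i)     // it
    g_t = \tanh(W_g x_t + H_g h_{t-1} + b_g)                    // tanh of input network
    f_t = \sigma(W_f x_t + H_f h_{t-1} + C_f c_{t-1} + b_f)     // ft
    c_t = f_t \odot c_{t-1} + i_t \odot g_t                     // ct = bft + bit
    o_t = \sigma(W_o x_t + H_o h_{t-1} + C_o c_t + b_o)         // ot, peephole from c_t
    h_t = W_{mr} Stabilize(o_t \odot \tanh(c_t))                // output = projection of mt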