From ea309aa1eeed6958c7dc2e5ffa7316509eb3dc77 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 14 Dec 2015 17:33:58 -0800
Subject: [PATCH 01/49] Replace CreateMatrixIfNull by MarkValueNonSharable()

In the compilation stage, we mark as non-sharable those nodes whose
descendants are all learnable parameters.
---
 .../CompositeComputationNodes.h               |  6 ++--
 .../ComputationNetwork.h                      |  1 +
 .../ComputationNetworkEvaluation.cpp          | 29 +++++++++++++++++++
 .../ComputationNode.h                         | 23 +++++++++++++--
 .../CNTKComputationNetworkLib/EsotericNodes.h |  6 ++--
 .../InputAndParamNodes.h                      |  6 ++--
 .../RecurrentNodes.h                          |  3 +-
 7 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
index 6d983a9784fe..8b11e37233e0 100644
--- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
@@ -233,7 +233,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         virtual void MarkComputed(const bool hasComputed)
         {
             m_hasComputed = hasComputed;
-            CreateMatrixIfNull(m_value);
+            // CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
         }
 
         virtual bool RequiresPreCompute() const override { return true; }
@@ -292,7 +293,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model
         virtual void SideLoadFromMatrix(const Matrix& value)
         {
-            CreateMatrixIfNull(m_value);
+            //CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             m_value->SetValue(value);
             m_hasComputed = true;
         }
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index 45093c903cb2..cc202c83e35b 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -159,6 +159,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
 private:
     void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo);
     void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode);
+    void MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode);
 private:
     void DetermineSetOfAllRoots();
     void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode);
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
index f0b8a78dcd82..5baae8553fb4 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
@@ -413,6 +413,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         for (auto & node : m_allRoots)
             ValidateSubNetwork(node);
 
+        // STEP: mark non-sharable function values
+        // if all the descendants of a particular node are learnable parameters,
+        // its function value is not sharable
+        for (auto & node : m_allRoots)
+            MarkValueNonSharableNodes(node);
+
+
         // STEP: Optimize the network.
// :) @@ -678,6 +685,28 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + // mark nodes that are purely induced by parameters as non-sharable and create space for value if null + void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) + { + const auto & nodes = GetEvalOrder(rootNode); + for (auto& node : nodes) + { + auto children = node->GetInputs(); + bool allChildrenNonSharable = true; + for (auto& child : children) + { + if (child->isValueSharable()) + { + allChildrenNonSharable = false; + break; + } + } + if (allChildrenNonSharable) + node->MarkValueNonSharable(); + } + + } + #if 0 // prepare to compute with the subnetwork that this rootNode depends on, including // - auto-detecting recurrent loops diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h index ab12cdf8707d..55546f00894b 100644 --- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h +++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h @@ -246,7 +246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0) + m_numRows(0), m_numCols(0), m_valueSharable(true) { } virtual ~ComputationNodeBase(){} @@ -428,6 +428,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } + bool isValueSharable() + { + return m_valueSharable; + } + virtual void MarkValueNonSharable() + { + m_valueSharable = false; + } protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -760,6 +768,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_parameterUpdateRequired; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves. bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop + + // flags related with sharable values + bool m_valueSharable; // whether value is sharable }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; @@ -807,7 +818,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Since the dimensions are read as well, this function also updates m_numRows/m_numCols. 
        void LoadValue(File& fstream)
        {
-            CreateMatrixIfNull(m_value);
+            // CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             fstream >> Value();
             // above reads dimensions, so we must update our own m_numRows/m_numCols
             m_numRows = Value().GetNumRows();
@@ -1293,6 +1305,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             CreateMatrixIfNull(m_gradient);
         }
 
+        void MarkValueNonSharable() override
+        {
+            m_valueSharable = false;
+            CreateMatrixIfNull(m_value);
+        }
+
+
     protected:
 
         // this function is used to create matrices for those needed before matrix pool is available
diff --git a/MachineLearning/CNTKComputationNetworkLib/EsotericNodes.h b/MachineLearning/CNTKComputationNetworkLib/EsotericNodes.h
index ab4f69592f03..03b5a7c2aaa8 100644
--- a/MachineLearning/CNTKComputationNetworkLib/EsotericNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/EsotericNodes.h
@@ -653,7 +653,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Init(size_t row_size, size_t col_size)
         {
-            CreateMatrixIfNull(m_value);
+            // CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             SetDims(row_size, col_size);
             UpdateFunctionValuesSize();
         }
@@ -663,7 +664,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             Base(deviceId, name)
         {
             Init(row_size, col_size);
-            CreateMatrixIfNull(m_gradient);
+            //CreateMatrixIfNull(m_gradient);
+            MarkValueNonSharable();
             m_gradient->Resize(row_size, col_size);
             m_gradient->SetValue(0.0f);
         }
diff --git a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
index 0bcaeeb3a170..dab53fb0a86c 100644
--- a/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/InputAndParamNodes.h
@@ -48,7 +48,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_parameterUpdateRequired = true;
             m_sampleLayout = ImageLayoutWHC(1, rows, 1);
             // TODO: Is ^^ this a wise choice? These are often weight matrices, where rows, not columns, are multiplied with input vectors.
-            CreateMatrixIfNull(m_value);
+            //CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             SetDims(rows, cols);
             UpdateFunctionValuesSize();         // this allocates the matrix
             Value().SetValue(0);
@@ -235,7 +236,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Init(size_t rows, size_t cols, bool isSparse)
         {
             m_isSparse = isSparse;
-            CreateMatrixIfNull(m_value);
+            //CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             if (isSparse)
                 ConvertToSparseMatrix();
 
diff --git a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
index eb940a75ee46..a59d89bff6a6 100644
--- a/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/RecurrentNodes.h
@@ -90,7 +90,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             m_initialActivationValue = initialActivationValue;
             m_timeStep = 1;
-            CreateMatrixIfNull(m_value);
+            // CreateMatrixIfNull(m_value);
+            MarkValueNonSharable();
             SetDims(row_size, col_size);
             m_isHistoryCarryOverManagedExternally = false;      // used for PairNetworkNode/PastValueNode combination
         }

From 2e49d617d6518f747ed1140b034d2e58b68fb4aa Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Wed, 16 Dec 2015 15:54:08 -0800
Subject: [PATCH 02/49] Revise the implementation of marking value-non-sharable nodes. More to be revised.
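The marking pass implemented by this patch walks the graph in evaluation (topological) order and flags a node's function value as non-sharable exactly when every leaf it depends on is a learnable parameter. A minimal standalone sketch of that propagation, using a simplified stand-in Node type (the fields below are illustrative, not the actual ComputationNode interface) and memoizing on the node itself instead of in the name-keyed map the patch uses:

    #include <memory>
    #include <vector>

    struct Node
    {
        std::vector<std::shared_ptr<Node>> inputs; // empty for leaf nodes
        bool isParameter = false;                  // true for learnable-parameter leaves
        bool valueSharable = true;                 // the flag this pass computes
        bool allLeavesAreParameters = false;       // memoized intermediate result
    };

    // 'nodes' must be in evaluation order, i.e. every node appears after its inputs.
    void MarkValueNonSharableNodes(const std::vector<std::shared_ptr<Node>>& nodes)
    {
        for (const auto& node : nodes)
        {
            bool allParams = true;
            if (node->inputs.empty())
                allParams = node->isParameter;      // leaf: parameter vs. input/feature
                // (in the real code, input/parameter/precompute leaves are already
                // marked non-sharable separately)
            else
                for (const auto& in : node->inputs) // interior node: AND over all inputs
                    allParams = allParams && in->allLeavesAreParameters;
            node->allLeavesAreParameters = allParams;
            // a value induced purely by parameters stays constant across minibatches,
            // so its buffer must not be recycled through the matrix pool
            node->valueSharable = !allParams;
        }
    }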
--- .../ComputationNetworkEvaluation.cpp | 47 ++++++++++++++++--- .../ComputationNetworkLib/ComputationNode.h | 24 +++++----- .../InputAndParamNodes.h | 2 + 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 5baae8553fb4..b983cd489825 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -15,6 +15,7 @@ #include #include #include +#include using namespace std; @@ -689,20 +690,52 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) { const auto & nodes = GetEvalOrder(rootNode); + std::map allLeafDescendentsAreParameters; for (auto& node : nodes) { auto children = node->GetInputs(); - bool allChildrenNonSharable = true; - for (auto& child : children) + wstring myname = node->NodeName(); + bool allParameters = true; + + if (children.size()) // we don't do the check for leaf node, cause all the possible leaf nodes (input/parameters/precompute node) are marked as non-sharable already { - if (child->isValueSharable()) + for (auto child : children) { - allChildrenNonSharable = false; - break; + wstring ChildName = child->NodeName(); + if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) + { + // not found, means it is a leaf node (we are at eval order ) + assert(child->IsLeaf()); + if (node->isLearnableParameter()) + { + allLeafDescendentsAreParameters[ChildName] = true; + } + else + { + allParameters = false; + allLeafDescendentsAreParameters[ChildName] = false; + break; + } + } + else + { + if (allLeafDescendentsAreParameters[ChildName] == false) + { + allParameters = false; + break; + } + } + } + allLeafDescendentsAreParameters[myname] = allParameters; + if (allParameters) + { + node->MarkValueNonSharable(); + } + else + { + node->MarkValueSharable(); } } - if (allChildrenNonSharable) - node->MarkValueNonSharable(); } } diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 31154b634f4e..81d1b640a5b8 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -148,7 +148,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { friend class ComputationNetwork; ComputationNetworkOwnedNodeState() : - m_needsGradient(false) + m_needsGradient(false), m_valueSharable(true) { PurgeStateForFormingRecurrentLoops(); m_isPartOfLoop = false; @@ -163,10 +163,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool IsPartOfLoop() const { return m_isPartOfLoop; } + virtual void MarkValueNonSharable(){ m_valueSharable = false; } + virtual void MarkValueSharable() { m_valueSharable = true; } + bool isValueSharable() { return m_valueSharable; } + protected: // TODO: should be fully encapsulated here bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree) + bool m_valueSharable; // a flag is needed for memory share. 
+ // If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters), + // it will never be released to memory pool private: bool m_isPartOfLoop; // true if this loop is part of a recurrent loop @@ -247,7 +254,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0), m_valueSharable(true) + m_numRows(0), m_numCols(0) { } virtual ~ComputationNodeBase(){} @@ -429,14 +436,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - bool isValueSharable() - { - return m_valueSharable; - } - virtual void MarkValueNonSharable() - { - m_valueSharable = false; - } + // sometimes, it is necessary to know whether it is a particular node (e.g., learnable parameter) + virtual bool isLearnableParameter() const { return false; } + protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -770,8 +772,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop - // flags related with sharable values - bool m_valueSharable; // whether value is sharable }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index dab53fb0a86c..d2e0f8039763 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -180,6 +180,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } + + virtual bool isLearnableParameter()const override{ return true; } }; #if 0 From 8eae46b59df4973d508f5ac3eaa8dd434b6f0a35 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Thu, 17 Dec 2015 17:44:22 -0800 Subject: [PATCH 03/49] Fix MarkValueNotSharableNodes --- .../ComputationNetwork.h | 2 +- .../ComputationNetworkEvaluation.cpp | 90 +++++++++---------- .../ComputationNetworkLib/ComputationNode.h | 6 +- .../InputAndParamNodes.h | 1 - 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index cc202c83e35b..5b3a0f16adf5 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -159,7 +159,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo); void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode); - void MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode); + void MarkValueNonSharableNodes(); private: void DetermineSetOfAllRoots(); void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index b983cd489825..f8b0b0cabd8c 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ 
b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -10,6 +10,7 @@ #include "ComputationNode.h" #include "ComputationNetwork.h" #include "RecurrentNodes.h" +#include "InputAndParamNodes.h" #include #include #include @@ -414,13 +415,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (auto & node : m_allRoots) ValidateSubNetwork(node); - // STEP: mark non-sharable function values - // if all the descendants of a particular node are learnable parameters, - // its function value is not sharable - for (auto & node : m_allRoots) - MarkValueNonSharableNodes(node); - - // STEP: Optimize the network. // :) @@ -686,11 +680,48 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } +#if 0 + // prepare to compute with the subnetwork that this rootNode depends on, including + // - auto-detecting recurrent loops + // - collect input and learnable nodes + // - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size) + // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already. + // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice. + void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode) + { + bool inserted = m_built.insert(rootNode).second; // remember we built it + if (!inserted) + return; // already done + + // detect recurrent loops for this root node + // TODO: not nice--why not always call this in ValidateSubNetwork() only? + FormRecurrentLoops(rootNode); + + // for the m_inputValues and m_learnableParameters sets for this rootNode + CollectInputAndLearnableParameters(rootNode); + + // validate the rootNode and all nodes it depends on, in evaluation order + ValidateSubNetwork(rootNode); + } + + // tests whether BuildAndValidateSubNetwork() was called + bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode) + { + return m_built.find(rootNode) != m_built.end(); + } +#endif + + // ----------------------------------------------------------------------- + // memory allocation + // ----------------------------------------------------------------------- // mark nodes that are purely induced by parameters as non-sharable and create space for value if null - void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) + void ComputationNetwork::MarkValueNonSharableNodes() { - const auto & nodes = GetEvalOrder(rootNode); + const auto & nodes = GetEvalOrder(nullptr); std::map allLeafDescendentsAreParameters; + std::list allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter)); + // note that: we cannot use m_learnableParameters because we need all parameters node, regardless whether it requires update or not + for (auto& node : nodes) { auto children = node->GetInputs(); @@ -706,7 +737,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { // not found, means it is a leaf node (we are at eval order ) assert(child->IsLeaf()); - if (node->isLearnableParameter()) + if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end()) { allLeafDescendentsAreParameters[ChildName] = true; } @@ -740,40 +771,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } -#if 0 - // prepare to compute with the subnetwork that this rootNode depends on, including - // - auto-detecting recurrent loops - // - collect input and learnable nodes - 
// - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size) - // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already. - // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice. - void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode) - { - bool inserted = m_built.insert(rootNode).second; // remember we built it - if (!inserted) - return; // already done - - // detect recurrent loops for this root node - // TODO: not nice--why not always call this in ValidateSubNetwork() only? - FormRecurrentLoops(rootNode); - - // for the m_inputValues and m_learnableParameters sets for this rootNode - CollectInputAndLearnableParameters(rootNode); - - // validate the rootNode and all nodes it depends on, in evaluation order - ValidateSubNetwork(rootNode); - } - - // tests whether BuildAndValidateSubNetwork() was called - bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode) - { - return m_built.find(rootNode) != m_built.end(); - } -#endif - - // ----------------------------------------------------------------------- - // memory allocation - // ----------------------------------------------------------------------- // this function will need to be called before actual validation and execution to // predetermine how to share matrices to reduce memory usage. @@ -788,9 +785,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { VerifyIsCompiled("AllocateAllMatrices"); + // Due to special topology, if a node is solely induced by parameters, its function value should not be shared + MarkValueNonSharableNodes(); + bool performingBackPropagation = (trainRootNode != nullptr); - // Create a composite Eval order with the specfied nodes as roots + // Create a composite Eval order with the specified nodes as roots std::vector forwardPropRoots; forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end()); forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end()); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 81d1b640a5b8..45ceacb7ffd3 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -165,7 +165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void MarkValueNonSharable(){ m_valueSharable = false; } virtual void MarkValueSharable() { m_valueSharable = true; } - bool isValueSharable() { return m_valueSharable; } + bool isValueSharable() const { return m_valueSharable; } protected: // TODO: should be fully encapsulated here @@ -436,8 +436,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - // sometimes, it is necessary to know whether it is a particular node (e.g., learnable parameter) - virtual bool isLearnableParameter() const { return false; } protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -518,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop; + return 
!g_shareNodeValueMatrices || m_outputNeededDuringBackprop || !isValueSharable(); } virtual void /*IComputationNode::*/InferImageDimsFromInputs() diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index d2e0f8039763..eb4b2f1bf3fe 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -181,7 +181,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } - virtual bool isLearnableParameter()const override{ return true; } }; #if 0 From 88be36aaf0ae753afdba65346c69c8f237b7aee1 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 18 Dec 2015 14:21:21 -0800 Subject: [PATCH 04/49] Revise the condition of ReleaseMatricesAfterForwardProp: only ValueSharable nodes can be released after forwardprop --- Source/ComputationNetworkLib/ComputationNode.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 45ceacb7ffd3..9314bc88abea 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -516,7 +516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop || !isValueSharable(); + return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop ; } virtual void /*IComputationNode::*/InferImageDimsFromInputs() @@ -905,7 +905,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //don't release matrices that need to be used in the gradient computation virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) { - if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE)) + if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } From 63eca285983e016fda12131e8a05854a35dec497 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 18 Dec 2015 23:32:59 -0800 Subject: [PATCH 05/49] Fix a bug in MarkValueSharableNode --- Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp | 2 +- Source/ComputationNetworkLib/ComputationNode.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index f8b0b0cabd8c..3b8f95064282 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -736,7 +736,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) { // not found, means it is a leaf node (we are at eval order ) - assert(child->IsLeaf()); + assert(child->IsLeaf() || child->IsPartOfLoop()); if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end()) { allLeafDescendentsAreParameters[ChildName] = true; diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 9314bc88abea..f2eb7747ed0f 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -934,7 +934,7 @@ namespace 
Microsoft { namespace MSR { namespace CNTK { // Release the Value matrix only if the output value is needed during backprop // since in the case it isn't used, we release it during forward prop itself - if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE) + if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } } From 0005c81a76dfc8d428d31dbb8f10ec3267ec8014 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Sat, 19 Dec 2015 00:18:13 -0800 Subject: [PATCH 06/49] Add an alternate option "numSubminibatches" for users to indicate how to split minibatches into subminibatches. --- Source/SGDLib/SGD.cpp | 25 ++++++++++++++++++------- Source/SGDLib/SGD.h | 6 +++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 27998d7d6bd0..e574c93ec51c 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -764,13 +764,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM. DataReaderHelpers::SubminibatchDispatcher smbDispatcher; size_t numSubminibatchesNeeded = 0; - if (m_maxSamplesInRAM < SIZE_MAX) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled + if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled { - // into how many pieces would we need to break the minibatch? - // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. - size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); - size_t estimatedMBSize = tunedMBSize * numParallelSequences; - numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + { + // into how many pieces would we need to break the minibatch? + // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. + size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); + size_t estimatedMBSize = tunedMBSize * numParallelSequences; + numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + } + if (m_numSubminiBatches > 1) + { + numSubminibatchesNeeded = m_numSubminiBatches; + } } // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) @@ -800,7 +807,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { } if (numSubminibatchesNeeded > 1) { - fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + else + fprintf(stderr, ", with %d subminibatch", (int)numSubminibatchesNeeded); } fprintf(stderr, ".\n"); @@ -2484,6 +2494,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector{ 256 }))); m_truncated = configSGD(L"truncated", false); m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t)SIZE_MAX); + m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t)1); // the number of samples in each epoch (0 means, use all the samples in each epoch). 
    m_epochSize = configSGD(L"epochSize", (size_t)0);
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index 15143dfa0208..b99608b500f8 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -157,7 +157,11 @@ struct SGDParams : public ScriptableObjects::Object
     // To mitigate this issue, we adopt the sub-minibatch implementation, where
     // each m_mbSize[epoch] is divided by a few sub-minibatch of which size will be no more than m_maxSamplesInRAM
     // a forward-backward is performed for each sub-minibathch; a model update is performed after each minibatch
-
+    size_t m_numSubminiBatches;
+    // alternative method to specify how to split minibatches into subminibatches
+    // default is 1, which means no subminibatch is used
+    // if m_maxTempMemSizeInSamples = SIZE_MAX (which means users do not specify the option) and m_numSubminiBatches > 1
+    // we divide one minibatch into m_numSubminiBatches subMinibatches
     // the number of samples in each epoch (0 means, use all the samples in each epoch).
     size_t m_epochSize;

From cba311ed72cd0876c9dca31cdc2880e1b6e79d1f Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 21 Dec 2015 21:56:33 -0800
Subject: [PATCH 07/49] Display CUB and CUDNN paths (if defined) in BuildInfo

Print BuildInfo at the very beginning of the program; this is convenient
for checking the build type.
---
 Source/CNTK/CNTK.cpp      |  8 ++++++--
 Source/CNTK/prebuild.bat  | 10 ++++++++++
 Tools/generate_build_info |  3 +++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp
index f2bd706bc773..d75d956e9c60 100644
--- a/Source/CNTK/CNTK.cpp
+++ b/Source/CNTK/CNTK.cpp
@@ -1667,6 +1667,9 @@ void PrintBuiltInfo()
 #ifdef _CUB_PATH_
     fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
 #endif
+#ifdef _CUDNN_PATH_
+    fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
+#endif
 #ifdef _GIT_EXIST
     fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
     fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
@@ -1885,7 +1888,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])   // called from wmain which i
         RedirectStdErr(logpath);
     }
 
-    PrintBuiltInfo();
+    PrintBuiltInfo(); // this one goes to log file
     std::string timestamp = TimeDateStamp();
 
     //dump config info
@@ -1960,10 +1963,11 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])   // called from wmain which i
 // main wrapper that catches C++ exceptions and prints them
 // ---------------------------------------------------------------------------
 
-int wmain1(int argc, wchar_t* argv[])   // called from wmain which is a wrapper that catches & repots Win32 exceptions
+int wmain1(int argc, wchar_t* argv[])   // called from wmain which is a wrapper that catches & reports Win32 exceptions
 {
     try
     {
+        PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
         if (argc <= 1)
             InvalidArgument("No command-line argument given.");
         // detect legacy CNTK configuration
diff --git a/Source/CNTK/prebuild.bat b/Source/CNTK/prebuild.bat
index 9f841d104da7..12631cf52e37 100644
--- a/Source/CNTK/prebuild.bat
+++ b/Source/CNTK/prebuild.bat
@@ -33,6 +33,16 @@ if "%cuda_path%" == "" (
     echo #define _CUDA_PATH_ "%cuda_path:\=\\%" >> buildinfo.h$$
     )
 
+if not "%cudnn_path%" == "" (
+    echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$
+    )
+
+if not "%cub_path%" == "" (
+    echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$
+    )
+
+
+
 echo #endif >> buildinfo.h$$
 
 ::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time)
diff --git a/Tools/generate_build_info b/Tools/generate_build_info
index a155fc84e792..62686222ef33 100755
--- a/Tools/generate_build_info
+++ b/Tools/generate_build_info
@@ -56,6 +56,9 @@ makebuildinfo()
     if [ ! -z "$CUB_PATH" ]; then
         printf "#define _CUB_PATH_ \"%s\"\n" $CUB_PATH >> $target
     fi
+    if [ ! -z "$CUDNN_PATH" ]; then
+        printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH >> $target
+    fi
     printf "#define _BUILDTYPE_ \"%s\"\n" $BUILDTYPE >> $target
     printf "#endif\n" >> $target
 }

From 762a5dd80f780037c47245167e23021d7b07d807 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 21 Dec 2015 23:20:13 -0800
Subject: [PATCH 08/49] Move MarkValueNonSharable out of constructors (make gcc happy)
---
 Source/ComputationNetworkLib/CompositeComputationNodes.h | 6 ++----
 Source/ComputationNetworkLib/ComputationNode.h           | 3 +--
 Source/ComputationNetworkLib/EsotericNodes.h             | 6 ++----
 Source/ComputationNetworkLib/InputAndParamNodes.h        | 4 +---
 Source/ComputationNetworkLib/RecurrentNodes.h            | 3 +--
 5 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h
index 8b11e37233e0..6d983a9784fe 100644
--- a/Source/ComputationNetworkLib/CompositeComputationNodes.h
+++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h
@@ -233,8 +233,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         virtual void MarkComputed(const bool hasComputed)
         {
             m_hasComputed = hasComputed;
-            // CreateMatrixIfNull(m_value);
-            MarkValueNonSharable();
+            CreateMatrixIfNull(m_value);
         }
 
         virtual bool RequiresPreCompute() const override { return true; }
@@ -292,8 +292,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model
         virtual void SideLoadFromMatrix(const Matrix& value)
         {
-            //CreateMatrixIfNull(m_value);
-            MarkValueNonSharable();
+            CreateMatrixIfNull(m_value);
             m_value->SetValue(value);
             m_hasComputed = true;
         }
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 4e59773afbbf..62cc38bb56f9 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -804,8 +804,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // Since the dimensions are read as well, this function also updates m_numRows/m_numCols.
void LoadValue(File& fstream) { - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); fstream >> Value(); // above reads dimensions, so we must update our own m_numRows/m_numCols m_numRows = Value().GetNumRows(); diff --git a/Source/ComputationNetworkLib/EsotericNodes.h b/Source/ComputationNetworkLib/EsotericNodes.h index 9e7c7517dc34..85b6ca8da8e4 100644 --- a/Source/ComputationNetworkLib/EsotericNodes.h +++ b/Source/ComputationNetworkLib/EsotericNodes.h @@ -653,8 +653,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(size_t row_size, size_t col_size) { - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); SetDims(row_size, col_size); UpdateFunctionValuesSize(); } @@ -664,8 +663,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { Init(row_size, col_size); - //CreateMatrixIfNull(m_gradient); - MarkValueNonSharable(); + CreateMatrixIfNull(m_gradient); m_gradient->Resize(row_size, col_size); m_gradient->SetValue(0.0f); } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index eb4b2f1bf3fe..778f68a28924 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -48,8 +48,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_parameterUpdateRequired = true; m_sampleLayout = ImageLayoutWHC(1, rows, 1); // TODO: Is ^^ this a wise choice? These are often weight matrices, where rows, not columns, are multiplied with input vectors. - //CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); SetDims(rows, cols); UpdateFunctionValuesSize(); // this allocates the matrix Value().SetValue(0); @@ -237,7 +236,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(size_t rows, size_t cols, bool isSparse) { m_isSparse = isSparse; - //CreateMatrixIfNull(m_value); MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index e97a79e70c68..844bf237e7ff 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -90,8 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { m_initialActivationValue = initialActivationValue; m_timeStep = 1; - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); SetDims(row_size, col_size); m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination } From 1ca02625785add9f621463620424029c16cb50d3 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 22 Dec 2015 00:19:22 -0800 Subject: [PATCH 09/49] (further remove MarkValueNotSharable out of constructor) --- Source/ComputationNetworkLib/InputAndParamNodes.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 778f68a28924..8948d8b36d03 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -236,7 +236,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(size_t rows, size_t cols, bool isSparse) { m_isSparse = isSparse; - MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); From 067bc561d72c49eea9ce6b7ef652763bbbc041b3 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 22 Dec 2015 
12:18:21 -0800
Subject: [PATCH 10/49] (Fix a bug in MarkValueSharable)
---
 Source/ComputationNetworkLib/InputAndParamNodes.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h
index 8948d8b36d03..125e572a22fc 100644
--- a/Source/ComputationNetworkLib/InputAndParamNodes.h
+++ b/Source/ComputationNetworkLib/InputAndParamNodes.h
@@ -40,12 +40,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             Base(deviceId, name)
         {
             m_parameterUpdateRequired = true;
+            m_valueSharable = false;
             m_sampleLayout = ImageLayoutWHC(1, SIZE_MAX, 1);
         }
         LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) :
             Base(deviceId, name)
         {
             m_parameterUpdateRequired = true;
+            m_valueSharable = false;
             m_sampleLayout = ImageLayoutWHC(1, rows, 1);
             // TODO: Is ^^ this a wise choice? These are often weight matrices, where rows, not columns, are multiplied with input vectors.
             CreateMatrixIfNull(m_value);
@@ -236,12 +238,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         void Init(size_t rows, size_t cols, bool isSparse)
         {
             m_isSparse = isSparse;
+            CreateMatrixIfNull(m_value);
             if (isSparse)
                 ConvertToSparseMatrix();
 
             SetDims(rows, cols);
             UpdateFunctionValuesSize();         // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
             m_parameterUpdateRequired = false;
+            m_valueSharable = false;
         }
     protected:
         InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, bool isSparse) :

From 3f987c03fd5342dee38a3b728bf466bec0b193b7 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 28 Dec 2015 15:51:08 -0800
Subject: [PATCH 11/49] Add an option "prefixPathInToc" in HTKMLFReader

This option allows specifying data paths relative to those in the TOC
files.
---
 Source/Common/Include/latticearchive.h       | 23 +++++++++++++-----
 Source/Common/Include/latticesource.h        | 11 ++++++++--
 Source/Readers/HTKMLFReader/HTKMLFReader.cpp |  7 ++++--
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/Source/Common/Include/latticearchive.h b/Source/Common/Include/latticearchive.h
index d1411396d372..f034c11ced9d 100644
--- a/Source/Common/Include/latticearchive.h
+++ b/Source/Common/Include/latticearchive.h
@@ -1016,6 +1016,8 @@ class archive
     // set of lattice archive files referenced
     // Note that .toc files can be concatenated, i.e. one .toc file can reference multiple archive files.
std::vector archivepaths; // [archiveindex] -> archive path + std::wstring prefixPathInToc; // prefix path in a toc; using this to avoid pushd some path before start training + mutable int verbosity; size_t getarchiveindex (const std::wstring & path) // get index of a path in archivepaths[]; create new entry if needed { auto iter = std::find (archivepaths.begin(), archivepaths.end(), path); @@ -1042,7 +1044,8 @@ class archive { // need to read the map and establish the mapping // get the symlist file const std::wstring symlistpath = archivepaths[archiveindex] + L".symlist"; - fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str()); + if (verbosity>0) + fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str()); std::vector textbuffer; auto lines = msra::files::fgetfilelines (symlistpath, textbuffer); // establish mapping of each entry to the corresponding id in 'symmap'; this should fail if the symbol is not found @@ -1092,19 +1095,25 @@ class archive public: // construct = open the archive //archive() : currentarchiveindex (SIZE_MAX) {} - + void setverbosity(int veb) const + { + verbosity = veb; + } // test if this object is loaded with anything (if not, an empty set of TOC paths was passed--meaning disable lattice mode) bool empty() const { return archivepaths.empty(); } // construct from a list of TOC files - archive (const std::vector & tocpaths, const std::unordered_map & modelsymmap) : currentarchiveindex (SIZE_MAX), modelsymmap (modelsymmap) + archive (const std::vector & tocpaths, const std::unordered_map & modelsymmap, const std::wstring prefixPath=L"") + : currentarchiveindex(SIZE_MAX), modelsymmap(modelsymmap), prefixPathInToc(prefixPath), verbosity(0) { if (tocpaths.empty()) // nothing to read--keep silent return; fprintf (stderr, "archive: opening %d lattice-archive TOC files ('%S' etc.)..", (int)tocpaths.size(), tocpaths[0].c_str()); + size_t onepercentage = tocpaths.size() / 100 ? tocpaths.size()/100 : 1; foreach_index (i, tocpaths) { - fprintf (stderr, "."); + if ( (i % onepercentage) == 0) + fprintf (stderr, "."); open (tocpaths[i]); } fprintf (stderr, " %d total lattices referenced in %d archive files\n", (int)toc.size(), (int)archivepaths.size()); @@ -1135,7 +1144,11 @@ class archive RuntimeError("open: invalid TOC line (no [): %s", line); if (q != p) { - const std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p)); + std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p)); + if (!prefixPathInToc.empty()) + { + archivepath = prefixPathInToc + L"/" + archivepath; + } // TODO: should we allow paths relative to TOC file? 
archiveindex = getarchiveindex (archivepath); } diff --git a/Source/Common/Include/latticesource.h b/Source/Common/Include/latticesource.h index fcf046b68908..0ec12508e9ca 100644 --- a/Source/Common/Include/latticesource.h +++ b/Source/Common/Include/latticesource.h @@ -23,10 +23,11 @@ class latticepair : public std::pair,std::vector> latticetocs, const std::unordered_map & modelsymmap) - : numlattices (latticetocs.first, modelsymmap), denlattices (latticetocs.second, modelsymmap) {} + latticesource (std::pair,std::vector> latticetocs, const std::unordered_map & modelsymmap, std::wstring RootPathInToc) + : numlattices (latticetocs.first, modelsymmap, RootPathInToc), denlattices (latticetocs.second, modelsymmap, RootPathInToc), verbosity(0) {} bool empty() const { @@ -52,6 +53,12 @@ class latticesource denlattices.getlattice (key, LP->second, expectedframes); // this loads the lattice from disk, using the existing L.second object L = LP; } + + void setverbosity(int veb) + { + verbosity = veb; + numlattices.setverbosity(veb); denlattices.setverbosity(veb); + } }; }} \ No newline at end of file diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp index 0db717a998f2..344883db85be 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp @@ -100,6 +100,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { vector scriptpaths; vector RootPathInScripts; + wstring RootPathInLatticeTocs; vector mlfpaths; vector>mlfpathsmulti; size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing @@ -263,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { expand_wildcards(thisLattice(L"numLatTocFile"), paths); latticetocs.first.insert(latticetocs.first.end(), paths.begin(), paths.end()); } - + RootPathInLatticeTocs = thisLattice(L"prefixPathInToc",L""); } //get HMM related file names @@ -448,7 +449,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!_wcsicmp(readMethod.c_str(), L"blockRandomize")) { // construct all the parameters we don't need, but need to be passed to the constructor... - m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap())); + + m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap(), RootPathInLatticeTocs)); + m_lattices->setverbosity(m_verbosity); // now get the frame source. 
This has better randomization and doesn't create temp files m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_frameMode)); From edae2da54d99542657ea7f948d48c5b6bd8f85e6 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 28 Dec 2015 16:44:23 -0800 Subject: [PATCH 12/49] Make lattice stats printf controlled by trace in READER --- Source/Common/Include/latticearchive.h | 30 +++++++++++++------ .../parallelforwardbackward.cpp | 4 +-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/Source/Common/Include/latticearchive.h b/Source/Common/Include/latticearchive.h index f034c11ced9d..ca489ad5cd35 100644 --- a/Source/Common/Include/latticearchive.h +++ b/Source/Common/Include/latticearchive.h @@ -51,6 +51,7 @@ enum mbrclassdefinition // used to identify definition of class in minimum b // =========================================================================== class lattice { + mutable int verbosity; struct header_v1_v2 { size_t numnodes : 32; @@ -567,11 +568,13 @@ class lattice std::vector backptroffsets; // TODO: we could change this to 'unsigned int' to save some transfer time std::vector backptrstorage; // CPU-side versions use this as the traceback buffer; CUDA code has its CUDA-side buffer size_t numofstates; // per sil hmm + int verbosity; public: - backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset) : numofstates(0) + backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset, int verbosity=0) : numofstates(0) { size_t edgeswithsilence = 0; // (diagnostics only: number of edges with at least one /sil/) size_t backptrbufsize = 0; // number of entries in buffer for silence backpointer array, used as cursor as we build it + backptroffsets.resize (L.edges.size() + 1); // +1, so that the final entry determines the overall size of the allocated buffer const size_t silUnitId = hset.gethmmid ("sil"); numofstates = hset.gethmm (silUnitId).getnumstates(); @@ -595,15 +598,18 @@ class lattice #if 1 // multiple /sil/ -> log this (as we are not sure whether this is actually proper--probably it is) if (numsilunits > 1) { - fprintf (stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits); - fprintf (stderr, "alignments: :"); - foreach_index (a, aligntokens) + if (verbosity) { - const auto & unit = aligntokens[a]; - const auto & hmm = hset.gethmm (unit.unit); - fprintf (stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f); + fprintf(stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits); + fprintf(stderr, "alignments: :"); + foreach_index(a, aligntokens) + { + const auto & unit = aligntokens[a]; + const auto & hmm = hset.gethmm(unit.unit); + fprintf(stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f); + } + fprintf(stderr, "\n"); } - fprintf (stderr, "\n"); } #endif if (numsilunits > 0) @@ -611,7 +617,8 @@ class lattice backptrbufsize += maxsilframes * numofstates; } backptroffsets[L.edges.size()] = backptrbufsize; // (TODO: remove if not actually needed) - fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size())); + if (verbosity) + fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size())); } // CUDA support const std::vector & getbackptroffsets() 
const { return backptroffsets; }
@@ -1002,6 +1009,10 @@ class lattice
     std::wstring key;           // (keep our own name (key) so we can identify ourselves for diagnostics messages)
     const wchar_t * getkey() const { return key.c_str(); }
+
+    void setverbosity(int veb) const{
+        verbosity = veb;
+    }
 };
 
 // ===========================================================================
@@ -1220,6 +1231,7 @@ class archive
         fsetpos (f, offset);
         // get it
         L.fread (f, idmap, spunit);
+        L.setverbosity(verbosity);
 #ifdef HACK_IN_SILENCE  // hack to simulate DEL in the lattice
         const size_t silunit = getid (modelsymmap, "sil");
         const bool addsp = true;
diff --git a/Source/SequenceTrainingLib/parallelforwardbackward.cpp b/Source/SequenceTrainingLib/parallelforwardbackward.cpp
index 3fb27b59fba1..bc4baaad9d8f 100644
--- a/Source/SequenceTrainingLib/parallelforwardbackward.cpp
+++ b/Source/SequenceTrainingLib/parallelforwardbackward.cpp
@@ -743,8 +743,8 @@ namespace msra { namespace lattices {
         double totalfwscore = 0.0f;
         if (!parallelstate->emulation)
         {
-
-            fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
+            if (verbosity>=2)
+                fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
 
             const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f);
             const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f);

From e3757f0054b59a907bdace1fe8140a8ee0749c21 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 4 Jan 2016 12:26:53 -0800
Subject: [PATCH 13/49] A stopgap to prevent the reader from loading matrices inconsistent with lattices

Will be removed once the bug is fixed.
---
 Source/Readers/HTKMLFReader/HTKMLFReader.cpp | 94 ++++++++++++--------
 Source/Readers/HTKMLFReader/HTKMLFReader.h   |  3 +
 2 files changed, 59 insertions(+), 38 deletions(-)

diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp
index 344883db85be..239409bb6528 100644
--- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp
+++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp
@@ -944,6 +944,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             if (!skip)
             {
+                // a stopgap
+                if (m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i])
+                {
+                    // BUGBUG: we just found that (due to some bugs yet to be tracked down),
+                    // the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs)
+                    // This is just a stopgap, to be removed after the bugs are found and fixed
+                    bool needRenew = true;
+                    while (needRenew)
+                    {
+                        size_t framenum = m_numFramesToProcess[i];
+                        fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n",
+                            framenum, m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str());
+                        ReNewBufferForMultiIO(i);
+                        needRenew = m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i];
+                    }
+
+                }
                 m_numValidFrames[i] = m_numFramesToProcess[i];
                 if (m_numValidFrames[i] > 0)
                 {
@@ -975,49 +992,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_extraNumSeqs = 0;
             if (!m_frameMode)
             {
-                // insert extra utterances to parallel sequences that have enough space left
-                // As long as there is a gap at the end of any parallel sequence that is large enough for another utterance, fill it in.
-                size_t nextMinibatchUttnum = 0;
-                bool inserted;
-                // The next utterances have already been prepared under parallel-sequence indices [i], in prep for the next MB.
-                // For each, we will go through all parallel sequences [j] to see whether the entry currently held for the next [i] fits into [j].
-                for (size_t i = 0; i < m_numSeqsPerMB; i++)
+                for (size_t src = 0; src < m_numSeqsPerMB; )
                 {
-                    while (nextMinibatchUttnum <= i)
+                    size_t framenum = m_numFramesToProcess[src];
+                    if (framenum == 0)
                     {
-                        size_t framenum = m_numFramesToProcess[i];
-                        inserted = false;
-                        if (framenum > 0)   // non-empty entry: see were it fits
-                        {
-                            // greedily search for a parallel sequence with enough space at the end to insert this utterance
-                            for (size_t j = 0; j < m_numSeqsPerMB; j++)
+                        src++;
+                        continue;
+                    }
+                    if (m_latticeBufferMultiUtt[src]!=nullptr && m_latticeBufferMultiUtt[src]->getnumframes()!=framenum)
+                    {
+                        // BUGBUG: we just found that (due to some bugs yet to be tracked down),
+                        // the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs)
+                        // This is just a stopgap, to be removed after the bugs are found and fixed
+                        fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n",
+                            framenum, m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str());
+                        src++;
+                        continue;
+                    }
+
+                    bool slotFound = false;
+                    for (size_t des = 0; des < m_numSeqsPerMB; des++)   // try to find a slot
+                    {
+                        if (framenum + m_numValidFrames[des] < m_mbNumTimeSteps)
+                        {   // found!
+                            m_extraSeqsPerMB.push_back(des);
+                            if (m_latticeBufferMultiUtt[src] != nullptr)
                             {
-                                if (framenum + m_numValidFrames[j] < m_mbNumTimeSteps)
-                                {
-                                    // enough space: insert it as parallel sequence [j] (instead of [i] in the next MB)
-                                    m_extraSeqsPerMB.push_back(j);
-                                    if (m_latticeBufferMultiUtt[i] != nullptr)
-                                    {
-                                        m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[i]);
-                                        m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[i]);
-                                        m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[i]);
-                                    }
-                                    fillOneUttDataforParallelmode(matrices, m_numValidFrames[j], framenum, j, i);
-                                    m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, j, m_numValidFrames[j], m_numValidFrames[j] + framenum);
-
-                                    // consume it
-                                    ReNewBufferForMultiIO(i);   // replace current [i] with a new one; then try again with this new one at [i]
-                                    m_numValidFrames[j] += framenum;
-                                    m_extraNumSeqs++;
-                                    inserted = true;
-                                    break;
-                                }
+                                m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[src]);
+                                m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[src]);
+                                m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[src]);
                             }
+                            fillOneUttDataforParallelmode(matrices, m_numValidFrames[des], framenum, des, src);
+                            m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, des, m_numValidFrames[des], m_numValidFrames[des] + framenum);
+
+                            ReNewBufferForMultiIO(src);
+                            m_numValidFrames[des] += framenum;
+                            m_extraNumSeqs++;
+                            slotFound = true;
+                            break;
                         }
-                        if (!inserted)
-                        {
-                            nextMinibatchUttnum++;  // didn't fit anywhere: done with entry [i]
-                        }
+                    }
+                    if (!slotFound)
+                    {
+                        src++;  // done with this source; try next source
                     }
                 }
 
diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.h b/Source/Readers/HTKMLFReader/HTKMLFReader.h
index fd6015c28383..7e64ee3e8f5d 100644
--- a/Source/Readers/HTKMLFReader/HTKMLFReader.h
+++ b/Source/Readers/HTKMLFReader/HTKMLFReader.h
@@ -32,6 +32,9 @@ class HTKMLFReader : public IDataReader
     intargvector m_numSeqsPerMBForAllEpochs;
     size_t m_numSeqsPerMB;                  // requested number of parallel sequences
     size_t m_mbNumTimeSteps;                // number of time steps to fill/filled (note: for frame randomization, this the #frames, and not 1 as later reported)
+    size_t m_mbMaxNumTimeSteps;             // max time steps we take in a MB layout; any sentence longer than this max will be discarded (and a warning will be issued)
+    // this is used to prevent CUDA out-of-memory errors
+
     vector m_numFramesToProcess;            // [seq index] number of frames available (left to return) in each parallel sequence
     vector m_switchFrame;                   /// TODO: something like the position where a new sequence starts; still supported?
     vector m_numValidFrames;                // [seq index] valid #frames in each parallel sequence. Frames (s, t) with t >= m_numValidFrames[s] are NoInput.
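The rewritten loop in this patch is a greedy first-fit packer: each pending utterance is appended to the first parallel sequence with enough free frames left, and sources that do not fit (or whose lattices mismatch) are skipped. A minimal sketch of just the placement policy, with plain integer lengths standing in for the reader's per-stream buffers (all names here are illustrative, not the reader's actual interface):

    #include <cstdio>
    #include <vector>

    // Greedily pack utterance lengths into 'numStreams' parallel sequences of
    // capacity 'maxT' frames; returns the lengths assigned to each stream.
    std::vector<std::vector<int>> PackUtterances(const std::vector<int>& lengths,
                                                 int numStreams, int maxT)
    {
        std::vector<std::vector<int>> streams(numStreams);
        std::vector<int> used(numStreams, 0);
        for (int len : lengths)
        {
            for (int s = 0; s < numStreams; s++)
            {
                if (used[s] + len < maxT) // strict '<', matching the check above
                {
                    streams[s].push_back(len);
                    used[s] += len;
                    break; // first fit wins; an unplaced utterance is simply left
                           // out here (the reader keeps it for the next minibatch)
                }
            }
        }
        return streams;
    }

    int main()
    {
        auto packed = PackUtterances({ 30, 50, 20, 40 }, /*numStreams=*/2, /*maxT=*/100);
        for (size_t s = 0; s < packed.size(); s++)
            for (int len : packed[s])
                std::printf("stream %d: utterance of %d frames\n", (int)s, len);
    }

First-fit is a deliberately simple heuristic: it does not minimize padding, but it never reorders utterances, which keeps the streaming reader's buffers consistent.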
From 8f7f19333c7fdb2c38429fb94c1ab6f951e21866 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 8 Jan 2016 14:23:36 -0800 Subject: [PATCH 14/49] (make gcc happy) --- Source/Common/Include/Sequences.h | 2 +- Source/ComputationNetworkLib/InputAndParamNodes.h | 6 +++--- Source/Readers/HTKMLFReader/HTKMLFReader.cpp | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 0b9824e84233..9b92e71d61bf 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -90,7 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // construction // ------------------------------------------------------------------- - MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } + MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_columnsValidityMask(CPUDEVICE), m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } MBLayout() : MBLayout(1, 0) { } // copy the content of another MBLayoutPtr over diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 125e572a22fc..4e86ebbb72e9 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -40,14 +40,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { m_parameterUpdateRequired = true; - m_valueSharable = false; + this->m_valueSharable = false; m_sampleLayout = ImageLayoutWHC(1, SIZE_MAX, 1); } LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, size_t rows, size_t cols) : Base(deviceId, name) { m_parameterUpdateRequired = true; - m_valueSharable = false; + this->m_valueSharable = false; m_sampleLayout = ImageLayoutWHC(1, rows, 1); // TODO: Is ^^ this a wise choice? These are often weight matrices, where rows, not columns, are multiplied with input vectors. CreateMatrixIfNull(m_value); @@ -245,7 +245,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { SetDims(rows, cols); UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that) m_parameterUpdateRequired = false; - m_valueSharable = false; + this->m_valueSharable = false; } protected: InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, bool isSparse) : diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp index 239409bb6528..ecc6283f615d 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp @@ -264,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { expand_wildcards(thisLattice(L"numLatTocFile"), paths); latticetocs.first.insert(latticetocs.first.end(), paths.begin(), paths.end()); } - RootPathInLatticeTocs = thisLattice(L"prefixPathInToc",L""); + RootPathInLatticeTocs =(wstring) thisLattice(L"prefixPathInToc",L""); } //get HMM related file names @@ -955,7 +955,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { size_t framenum = m_numFramesToProcess[i]; fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. 
Ignoring this utterance %ls\n", - framenum, m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str()); + (int)framenum, (int)m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str()); ReNewBufferForMultiIO(i); needRenew = m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i]; } @@ -1006,7 +1006,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs) // This is just a stopgap, to be removed after the bugs are found and fixed fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n", - framenum, m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str()); + (int)framenum, (int)m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str()); src++; continue; } From fc3361438fc81add21110cf38182241073c302a6 Mon Sep 17 00:00:00 2001 From: RuiZhao Date: Fri, 18 Dec 2015 15:17:48 -0800 Subject: [PATCH 15/49] frameskip SE --- Source/Math/latticefunctionskernels.h | 25 +++++++++++++++++-- Source/SequenceTrainingLib/gammacalculation.h | 4 +-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h index 876e3c6a867e..5053b271690b 100644 --- a/Source/Math/latticefunctionskernels.h +++ b/Source/Math/latticefunctionskernels.h @@ -358,6 +358,7 @@ struct latticefunctionskernels size_t state1step0to1 = te; // inflection point from state 0 to 1, record in state 1 size_t state2step0to1 = te; // inflection point from state 0 to 1, record in state 2 size_t state2step1to2 = te; // inflection point from state 1 to 2, record in state 2 + size_t state2step0to2 = te; //now we only support transition from -1 to 0 or 2 for sil float pathscore0 = fwscore ; // log pp in state 0 @@ -400,16 +401,20 @@ struct latticefunctionskernels pathscore2 = pathscore12; state2step0to1 = state1step0to1; // record the inflection point state2step1to2 = t; // record the inflection point + state2step0to2 = te; if (isSil) backptrmatrix (2, t-ts-1) = 1; } - if (isSil) // only silence have path from 0 to 2 + //if (isSil) // only silence have path from 0 to 2 { const float pathscore02 = pathscore0 + getlogtransp(transP,0,2); // log pp from state 0 to 2 if (pathscore02 >= pathscore2) // if state 0->2 { pathscore2 = pathscore02; - backptrmatrix (2, t-ts-1) = 0; + if (isSil) + backptrmatrix (2, t-ts-1) = 0; + state2step0to2 = t; + state2step1to2 = te; } } @@ -494,6 +499,21 @@ struct latticefunctionskernels // emit alignment if (!isSil) + { + if (state2step0to2 < te) + { + state2step0to2 += alignindex - ts; + for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment + { + size_t senoneid; + if (t < state2step0to2) // in state 0 + senoneid = senoneid0; + else // in state 2 + senoneid = senoneid2; + alignresult[t] = (unsigned short)senoneid; + } + } + else { state2step0to1 += alignindex - ts; // convert to align measure state2step1to2 += alignindex - ts; @@ -509,6 +529,7 @@ struct latticefunctionskernels alignresult[t] = (unsigned short) senoneid; } } + } else // for silence { size_t lastpointer = 2; diff --git a/Source/SequenceTrainingLib/gammacalculation.h b/Source/SequenceTrainingLib/gammacalculation.h index 4ad7d8f46c63..f8a60e5bd478 100644 
--- a/Source/SequenceTrainingLib/gammacalculation.h +++ b/Source/SequenceTrainingLib/gammacalculation.h @@ -19,9 +19,9 @@ namespace msra { namespace lattices { GammaCalculation() : cpumode(false) { initialmark = false; - lmf = 14.0f; // Note that 9 was best for Fisher --these should best be configurable + lmf = 7.0f; // Note that 9 was best for Fisher --these should best be configurable wp = 0.0f; - amf = 14.0f; + amf = 7.0f; boostmmifactor = 0.0f; seqsMBRmode = false; } From ef84011f1619a8b0be24a3debc7d43d312c74100 Mon Sep 17 00:00:00 2001 From: RuiZhao Date: Wed, 23 Dec 2015 11:39:39 -0800 Subject: [PATCH 16/49] SE frameskip V2 temp --- Source/Math/latticefunctionskernels.h | 65 ++++++++++++------- .../latticeforwardbackward.cpp | 3 +- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h index 5053b271690b..ec164960e1b5 100644 --- a/Source/Math/latticefunctionskernels.h +++ b/Source/Math/latticefunctionskernels.h @@ -356,27 +356,39 @@ struct latticefunctionskernels const size_t te = ts + numframes; // end time of current unit size_t state1step0to1 = te; // inflection point from state 0 to 1, record in state 1 + size_t state1stepm1to1 = te; size_t state2step0to1 = te; // inflection point from state 0 to 1, record in state 2 + size_t state2stepm1to1 = te; // inflection point from state 0 to 1, record in state 2 size_t state2step1to2 = te; // inflection point from state 1 to 2, record in state 2 size_t state2step0to2 = te; //now we only support transition from -1 to 0 or 2 for sil - float pathscore0 = fwscore ; // log pp in state 0 - float pathscore1 = LOGZERO; // log pp in state 1 - float pathscore2 = LOGZERO; // log pp in state 2 - if(isSil) - pathscore2 = fwscore; + float pathscore0 = fwscore; // log pp in state 0 + float pathscore1 = fwscore; // log pp in state 1 + float pathscore2 = fwscore; // log pp in state 2 + + // first frame if (ts != te) // for t = ts, initialization { - if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted + /* if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted { pathscore0 += getlogtransp(transP,-1,0) + logLLs(senoneid0,ts); pathscore2 += getlogtransp(transP,-1,2) + logLLs(senoneid2,ts); } - else //for others, only -1 to 0 is permitted - pathscore0 += logLLs(senoneid0,ts); // Note: no need to incorporate LLs for state [1] and [2] because the path log LLs are LOGZERO anyway + else //for others, only -1 to 0 is permitted + { + pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + + }*/ + pathscore2 = getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts); + pathscore1 = getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + state1stepm1to1 = ts; + pathscore0 = getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + + } @@ -400,6 +412,7 @@ struct latticefunctionskernels { pathscore2 = pathscore12; state2step0to1 = state1step0to1; // record the inflection point + state2stepm1to1 = state1stepm1to1; state2step1to2 = t; // record the inflection point state2step0to2 = te; if (isSil) @@ -427,9 +440,11 @@ struct latticefunctionskernels { pathscore1 = pathscore01; state1step0to1 = t; // record the inflection point + state1stepm1to1 = te; if (isSil) backptrmatrix (1, t-ts-1) = 0; } + if (isSil) // only silence have path from 2 to 1 { const float pathscore21 = pathscore2last + getlogtransp(transP,2,1); @@ -500,7 +515,7 @@ struct latticefunctionskernels if (!isSil) { - if (state2step0to2 < te) + if 
(state2step0to2 < te) //from 0 to 2 { state2step0to2 += alignindex - ts; for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment @@ -513,22 +528,22 @@ struct latticefunctionskernels alignresult[t] = (unsigned short)senoneid; } } - else - { - state2step0to1 += alignindex - ts; // convert to align measure - state2step1to2 += alignindex - ts; - for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment - { - size_t senoneid; - if (t < state2step0to1) // in state 0 - senoneid = senoneid0; - else if(t < state2step1to2) // in state 1 - senoneid = senoneid1; - else // in state 2 - senoneid = senoneid2; - alignresult[t] = (unsigned short) senoneid; - } - } + else //from 1 to 2 + { + state2step0to1 += alignindex - ts; // convert to align measure + state2step1to2 += alignindex - ts; + for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment + { + size_t senoneid; + if (state2step0to1 < te && t < state2step0to1) + senoneid = senoneid0; + else if(t < state2step1to2) // in state 1 + senoneid = senoneid1; + else // in state 2 + senoneid = senoneid2; + alignresult[t] = (unsigned short) senoneid; + } + } } else // for silence { diff --git a/Source/SequenceTrainingLib/latticeforwardbackward.cpp b/Source/SequenceTrainingLib/latticeforwardbackward.cpp index 0f55d5ba0ff9..942f6adffa8d 100644 --- a/Source/SequenceTrainingLib/latticeforwardbackward.cpp +++ b/Source/SequenceTrainingLib/latticeforwardbackward.cpp @@ -438,6 +438,7 @@ template static bool islogzero (FLOAT v) { return v < LOGZERO/2; LogicError("invalid backpointer resulting in state index out of range"); int bp = (int) backpointers(j,t); // save the backpointer before overwriting it (gammas and backpointers are aliases of each other) + thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js); if (!returnsenoneids) // return binary gammas (for MMI; this mode is compatible with softalignmode) for (size_t i = js; i < je; i++) loggammas(i,t) = ((int) i == j) ? 
0.0f : LOGZERO; @@ -784,7 +785,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate, // - per-edge acoustic scores const size_t silunitid = hset.gethmmid("sil"); // shall be the same as parallelstate.getsilunitid() bool parallelsil = true; - bool cpuverification = false; + bool cpuverification = true; #ifndef PARALLEL_SIL // we use a define to make this marked parallelsil = false; From 1ad6c2d4192d702309b982d858f7f91d86be9a04 Mon Sep 17 00:00:00 2001 From: RuiZhao Date: Thu, 24 Dec 2015 13:48:21 -0800 Subject: [PATCH 17/49] frameskipv2 --- Source/Math/latticefunctionskernels.h | 8 ++++---- Source/SequenceTrainingLib/latticeforwardbackward.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h index ec164960e1b5..e30f35582236 100644 --- a/Source/Math/latticefunctionskernels.h +++ b/Source/Math/latticefunctionskernels.h @@ -383,10 +383,10 @@ struct latticefunctionskernels pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); }*/ - pathscore2 = getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts); - pathscore1 = getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + pathscore2 += getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts); + pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); state1stepm1to1 = ts; - pathscore0 = getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); } @@ -535,7 +535,7 @@ struct latticefunctionskernels for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment { size_t senoneid; - if (state2step0to1 < te && t < state2step0to1) + if (state2step0to1 static bool islogzero (FLOAT v) { return v < LOGZERO/2; LogicError("invalid backpointer resulting in state index out of range"); int bp = (int) backpointers(j,t); // save the backpointer before overwriting it (gammas and backpointers are aliases of each other) - thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js); + //thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js); if (!returnsenoneids) // return binary gammas (for MMI; this mode is compatible with softalignmode) for (size_t i = js; i < je; i++) loggammas(i,t) = ((int) i == j) ? 
0.0f : LOGZERO;
@@ -785,7 +785,7 @@ void lattice::forwardbackwardalign (parallelstate & parallelstate,
     // - per-edge acoustic scores
     const size_t silunitid = hset.gethmmid("sil");    // shall be the same as parallelstate.getsilunitid()
     bool parallelsil = true;
-    bool cpuverification = true;
+    bool cpuverification = false;

 #ifndef PARALLEL_SIL    // we use a define to make this marked
     parallelsil = false;

From a1427cc3cfc026c1171bc4020896ca1e8279411b Mon Sep 17 00:00:00 2001
From: RuiZhao
Date: Tue, 29 Dec 2015 15:00:09 -0800
Subject: [PATCH 18/49] release temp matrix in SE

---
 .../TrainingCriterionNodes.h | 17 +++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
index a34a92c66c04..361e57472f4b 100644
--- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h
+++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
@@ -1292,8 +1292,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             }
             else if (inputIndex == 1)
             {
-                BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(),
-                                Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold);
+                FrameRange fr(Input(0)->GetMBLayout());
+                BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(),
+                                Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold);
+                MaskMissingColumnsToZero(Input(inputIndex)->Gradient(), Input(0)->GetMBLayout(), fr);
+
 #ifdef _DEBUG
                 Input(inputIndex)->InvalidateMissingGradientColumns(FrameRange(Input(inputIndex)->GetMBLayout()));
 #endif
@@ -1433,6 +1436,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
             RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
         }
+
+        // release the temporary matrices used for the gamma calculation
+        virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool)
+        {
+            Base::ReleaseMatricesAfterForwardProp(matrixPool);
+            ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
+            ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
+            ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
+        }
+
         // TODO: method names should be CamelCase
         std::vector> * getLatticePtr()
         {

From 5499741fde4789a11bd93f148b2f37b955f52b48 Mon Sep 17 00:00:00 2001
From: RuiZhao
Date: Wed, 6 Jan 2016 16:48:10 -0800
Subject: [PATCH 19/49] release after BP

---
 Source/ComputationNetworkLib/TrainingCriterionNodes.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
index 361e57472f4b..07190640cb70 100644
--- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h
+++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h
@@ -1438,7 +1438,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }

         // release the temporary matrices used for the gamma calculation
-        virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool)
+        virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
         {
-            Base::ReleaseMatricesAfterForwardProp(matrixPool);
+            Base::ReleaseMatricesAfterBackprop(matrixPool);
             ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);

From c41eafa4f8c09ec6d9dad91be350dd65d7598464 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Sat, 9 Jan 2016 23:16:45 -0800
Subject: [PATCH 20/49] Move all data members in MBLayout to CPU

---
 Source/Common/Include/Sequences.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git
a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 0b9824e84233..9b92e71d61bf 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -90,7 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // construction // ------------------------------------------------------------------- - MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } + MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_columnsValidityMask(CPUDEVICE), m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } MBLayout() : MBLayout(1, 0) { } // copy the content of another MBLayoutPtr over From 05e0262bf1f305d9fb19da9082acdc8ba8307f08 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Sat, 9 Jan 2016 23:17:51 -0800 Subject: [PATCH 21/49] Bug fix for ConvertDBN command --- Source/CNTK/SimpleNetworkBuilder.cpp | 8 ++++---- Source/ComputationNetworkLib/CompositeComputationNodes.h | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Source/CNTK/SimpleNetworkBuilder.cpp b/Source/CNTK/SimpleNetworkBuilder.cpp index 726c4a86220d..4e1ff488722f 100644 --- a/Source/CNTK/SimpleNetworkBuilder.cpp +++ b/Source/CNTK/SimpleNetworkBuilder.cpp @@ -2419,9 +2419,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix priorVals = ReadMatrixFromDbnFile(fstream, std::string("Pu")); assert(priorVals.GetNumCols() == 1 && priorVals.GetNumRows() == m_outputLayerSize); - w = builder.Mean(label, L"Prior"); - static_pointer_cast>(w)->SideLoadFromMatrix(priorVals); - w->SetParameterUpdateRequired(false); + prior = builder.Mean(label, L"Prior"); + static_pointer_cast>(prior)->SideLoadFromMatrix(priorVals); + prior->SetParameterUpdateRequired(false); } else // pretrained network - need to add output layer, initalize { @@ -2461,7 +2461,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (layerType == "perceptron" || m_needPrior) { - input = builder.Log(pcNodePtr, L"LogOfPrior"); + input = builder.Log(prior, L"LogOfPrior"); //following two lines is needed only if true probability is needed //output = builder.Softmax(output); diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index 6d983a9784fe..9bdd6f38ce56 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -295,6 +295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CreateMatrixIfNull(m_value); m_value->SetValue(value); m_hasComputed = true; + SetDims(value.GetNumRows(), value.GetNumCols()); } public: bool m_hasComputed; From 569a4d6c21bed92b688910eab853df5daf295f18 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 11 Jan 2016 16:32:19 -0800 Subject: [PATCH 22/49] Add support for revising batch normalization property in MEL. Now a BatchNormalization node's eval mode can be modified by SetProperty(BNnode, batchNormEvalMode, true); or by SetPropertyForSubTree(rootNode, batchNormEvalMode, true); in which all the BN nodes under rootNode will be changed. 
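Because the network holds type-erased ComputationNodeBase pointers, the implementation below has to probe both element-type instantiations of the node before it can flip the flag. A condensed sketch of that probe-both-instantiations pattern, using standalone toy types (not the actual CNTK class hierarchy):

    #include <memory>
    #include <stdexcept>

    struct NodeBase { virtual ~NodeBase() = default; };

    template <class ElemType>
    struct BatchNormNode : NodeBase
    {
        void SetEvalMode(bool evalMode) { m_eval = evalMode; }
        bool m_eval = false;
    };

    // Try the float instantiation first, then double; anything else is an error.
    void SetBatchNormEvalMode(const std::shared_ptr<NodeBase>& node, bool evalMode)
    {
        if (auto pf = std::dynamic_pointer_cast<BatchNormNode<float>>(node))
            pf->SetEvalMode(evalMode);
        else if (auto pd = std::dynamic_pointer_cast<BatchNormNode<double>>(node))
            pd->SetEvalMode(evalMode);
        else
            throw std::runtime_error("not a batch-normalization node of a known element type");
    }

Dispatching via dynamic_pointer_cast is the price of keeping the node graph untyped at the network level; the patch below repeats this pattern in both the single-node and the subtree variants.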
---
 Source/CNTK/ModelEditLanguage.cpp | 45 ++++++++++++++++++-
 .../ComputationNetwork.h | 3 +-
 .../ComputationNetworkEditing.cpp | 40 +++++++++++++++++
 .../ConvolutionalNodes.h | 4 ++
 Source/Math/latticefunctionskernels.h | 10 ++---
 5 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp
index 981b63ffd53f..94a1dc185fb7 100644
--- a/Source/CNTK/ModelEditLanguage.cpp
+++ b/Source/CNTK/ModelEditLanguage.cpp
@@ -9,6 +9,7 @@
 #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
 #include "ModelEditLanguage.h"
+#include "ConvolutionalNodes.h"
 #include

 namespace Microsoft { namespace MSR { namespace CNTK {

@@ -56,7 +57,8 @@ enum MELProperty
     melPropFinalCriterion,
     melPropEvaluation,
     melPropOutput,
-    melPropRecurrent
+    melPropRecurrent,
+    melPropBatchNormMode,
 };

 // SetProperty - Set the Property on the passed node
@@ -420,6 +422,10 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
         {
             prop = melPropEvaluation;
         }
+        else if (EqualInsensitive(propName, "batchNormEvalMode"))
+        {
+            prop = melPropBatchNormMode;
+        }
         else if (EqualInsensitive(propName, "output"))
         {
             prop = melPropOutput;
@@ -485,6 +491,33 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
                 // what to do here?
                 break;
             }
+            case melPropBatchNormMode:
+            {
+                if (node->OperationName() != OperationNameOf(BatchNormalizationNode))
+                {
+                    RuntimeError("Invalid node type: node %ls (type:%ls) is not a %ls node; therefore cannot apply batchNormEvalMode on it.",
+                                 node->NodeName().c_str(),
+                                 node->OperationName().c_str(),
+                                 OperationNameOf(BatchNormalizationNode).c_str());
+                }
+                bool property = params[2];
+                auto pnode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
+                if (pnode)
+                {
+                    pnode->SetEvalMode(property);
+                }
+                else
+                {
+                    auto pnode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
+                    if (pnode2)
+                        pnode2->SetEvalMode(property);
+                    else
+                    {
+                        RuntimeError("Invalid node type: node name=%ls.
We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n", node->NodeName().c_str());
+                    }
+                }
+                break;
+            }
             default:
             {
                 RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@@ -505,6 +538,10 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
         {
             prop = melPropComputeGradient;
         }
+        else if (EqualInsensitive(propName, "batchNormEvalMode"))
+        {
+            prop = melPropBatchNormMode;
+        }
         else
         {
             RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@@ -527,6 +564,12 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
             netNdl->cn->SetLearnableNodesBelowNeedGradient(needGradient, node);
             break;
         }
+        case melPropBatchNormMode:
+        {
+            bool evalMode = params[2];
+            netNdl->cn->SetBatchNormlizationNodesBelowEvalMode(evalMode, node);
+            break;
+        }
         default:
         {
             RuntimeError("Invalid property, %s, is not supported", propName.c_str());
diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h
index 5b3a0f16adf5..8d6d5195ec65 100644
--- a/Source/ComputationNetworkLib/ComputationNetwork.h
+++ b/Source/ComputationNetworkLib/ComputationNetwork.h
@@ -346,7 +346,8 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
     void ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode);
     void AddFeatureNode(ComputationNodeBasePtr featureNode);
     void RemoveFeatureNode(ComputationNodeBasePtr featureNode);
-    void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr);
+    void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr);
+    void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);

     // -----------------------------------------------------------------------
     // node access
diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
index bdd1a063114d..a44e268c8907 100644
--- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
+++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
@@ -10,6 +10,7 @@
 #include "ComputationNode.h"
 #include "ComputationNetwork.h"
 #include "InputAndParamNodes.h"
+#include "ConvolutionalNodes.h"
 #include
 #include
 #include
@@ -314,4 +315,43 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     }

+    void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
+    {
+        vector<ComputationNodeBasePtr> nodes;
+        if (rootNode == nullptr)
+        {
+            for (auto pair : m_nameToNodeMap)
+            {
+                nodes.push_back(pair.second);
+            }
+        }
+        else
+        {
+            auto allnodes = rootNode->EnumerateNodes(true);
+            for (auto node : allnodes)
+                nodes.push_back(node);
+        }
+
+        for (auto& node : nodes)
+        {
+            if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
+            {
+                auto pNode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
+                if (!pNode)
+                {
+                    auto pNode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
+                    if (!pNode2)
+                    {
+                        RuntimeError("Invalid node type: node name=%ls.
We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n", node->NodeName().c_str());
+                    }
+                    else
+                        pNode2->SetEvalMode(evalMode);
+                }
+                else
+                {
+                    pNode->SetEvalMode(evalMode);
+                }
+            }
+        }
+    }
 }}}
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h
index 8545f32f5e4d..45031d4722d1 100644
--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -740,6 +740,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
     }

+    void SetEvalMode(bool bnEvalMode)
+    {
+        m_eval = bnEvalMode;
+    }
 private:
     struct VersionInfo
     {
diff --git a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h
index e30f35582236..b2b7d4b08297 100644
--- a/Source/Math/latticefunctionskernels.h
+++ b/Source/Math/latticefunctionskernels.h
@@ -356,9 +356,9 @@ struct latticefunctionskernels
     const size_t te = ts + numframes;    // end time of current unit
     size_t state1step0to1 = te;    // inflection point from state 0 to 1, record in state 1
-    size_t state1stepm1to1 = te;
+    //size_t state1stepm1to1 = te;
     size_t state2step0to1 = te;    // inflection point from state 0 to 1, record in state 2
-    size_t state2stepm1to1 = te;    // inflection point from state 0 to 1, record in state 2
+    //size_t state2stepm1to1 = te;    // inflection point from state 0 to 1, record in state 2
     size_t state2step1to2 = te;    // inflection point from state 1 to 2, record in state 2
     size_t state2step0to2 = te;
@@ -385,7 +385,7 @@ struct latticefunctionskernels
     }*/
     pathscore2 += getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts);
     pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts);
-    state1stepm1to1 = ts;
+    //state1stepm1to1 = ts;
     pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts);
@@ -412,7 +412,7 @@ struct latticefunctionskernels
     {
         pathscore2 = pathscore12;
         state2step0to1 = state1step0to1;    // record the inflection point
-        state2stepm1to1 = state1stepm1to1;
+        //state2stepm1to1 = state1stepm1to1;
         state2step1to2 = t;    // record the inflection point
         state2step0to2 = te;
         if (isSil)
@@ -440,7 +440,7 @@ struct latticefunctionskernels
     {
         pathscore1 = pathscore01;
         state1step0to1 = t;    // record the inflection point
-        state1stepm1to1 = te;
+        //state1stepm1to1 = te;
         if (isSil)
             backptrmatrix (1, t-ts-1) = 0;
     }

From edbb47dc79d6dd8ed844f20ffa13be45c3d374c7 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Mon, 14 Dec 2015 17:33:58 -0800
Subject: [PATCH 23/49] Replace CreateMatrixIfNull by MarkValueNonSharable()

In the compile stage, we mark nodes whose descendants are all learnable
parameters as non-sharable.
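The rule this patch implements is purely structural: walking the nodes in evaluation order (children before parents), a node's output is marked non-sharable as soon as every one of its inputs is itself non-sharable, so values computed only from parameters are kept out of the shared matrix pool. A condensed sketch over a simplified node type (illustrative names, not the real ComputationNode interface):

    #include <memory>
    #include <vector>

    struct Node
    {
        std::vector<std::shared_ptr<Node>> inputs;
        bool valueSharable = true; // LearnableParameter-like leaves start out false
    };

    // evalOrder lists children before parents, as in GetEvalOrder().
    void MarkValueNonSharableNodes(const std::vector<std::shared_ptr<Node>>& evalOrder)
    {
        for (auto& node : evalOrder)
        {
            if (node->inputs.empty())
                continue; // leaves keep their preset flag
            bool allInputsNonSharable = true;
            for (auto& child : node->inputs)
                if (child->valueSharable) { allInputsNonSharable = false; break; }
            if (allInputsNonSharable)
                node->valueSharable = false;
        }
    }

Since parameters are constant within a minibatch, such derived values are computed once and must survive pool recycling; everything else remains eligible for reuse.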
--- .../CompositeComputationNodes.h | 6 ++-- .../ComputationNetwork.h | 1 + .../ComputationNetworkEvaluation.cpp | 29 +++++++++++++++++++ .../ComputationNetworkLib/ComputationNode.h | 23 +++++++++++++-- Source/ComputationNetworkLib/EsotericNodes.h | 3 +- .../InputAndParamNodes.h | 3 +- 6 files changed, 59 insertions(+), 6 deletions(-) diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index f8f79dc21642..5027ef6c2bd9 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -234,7 +234,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void MarkComputed(const bool hasComputed) { m_hasComputed = hasComputed; - CreateMatrixIfNull(m_value); + // CreateMatrixIfNull(m_value); + MarkValueNonSharable(); } virtual bool RequiresPreCompute() const override { return true; } @@ -293,7 +294,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model virtual void SideLoadFromMatrix(const Matrix& value) { - CreateMatrixIfNull(m_value); + //CreateMatrixIfNull(m_value); + MarkValueNonSharable(); m_value->SetValue(value); m_hasComputed = true; } diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 26b78d8be05f..b533602786c5 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -159,6 +159,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo); void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode); + void MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode); private: void DetermineSetOfAllRoots(); void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 084fe9ce9a69..0cdeec07f56e 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -413,6 +413,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (auto & node : m_allRoots) ValidateSubNetwork(node); + // STEP: mark non-sharable function values + // if all the descendants of a particular node are learnable parameters, + // its function value is not sharable + for (auto & node : m_allRoots) + MarkValueNonSharableNodes(node); + + // STEP: Optimize the network. 
// :) @@ -678,6 +685,28 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + // mark nodes that are purely induced by parameters as non-sharable and create space for value if null + void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) + { + const auto & nodes = GetEvalOrder(rootNode); + for (auto& node : nodes) + { + auto children = node->GetInputs(); + bool allChildrenNonSharable = true; + for (auto& child : children) + { + if (child->isValueSharable()) + { + allChildrenNonSharable = false; + break; + } + } + if (allChildrenNonSharable) + node->MarkValueNonSharable(); + } + + } + #if 0 // prepare to compute with the subnetwork that this rootNode depends on, including // - auto-detecting recurrent loops diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index b4e0725bd7c2..198c1dcb6421 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -250,7 +250,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0) + m_numRows(0), m_numCols(0), m_valueSharable(true) { } virtual ~ComputationNodeBase(){} @@ -455,6 +455,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } + bool isValueSharable() + { + return m_valueSharable; + } + virtual void MarkValueNonSharable() + { + m_valueSharable = false; + } protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -769,6 +777,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_parameterUpdateRequired; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves. bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop + + // flags related with sharable values + bool m_valueSharable; // whether value is sharable }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; @@ -815,7 +826,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Since the dimensions are read as well, this function also updates m_numRows/m_numCols. 
void LoadValue(File& fstream) { - CreateMatrixIfNull(m_value); + // CreateMatrixIfNull(m_value); + MarkValueNonSharable(); fstream >> Value(); // above reads dimensions, so we must update our own m_numRows/m_numCols SetDims(TensorShape(Value().GetNumRows()), Value().GetNumCols()); @@ -1317,6 +1329,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { CreateMatrixIfNull(m_gradient); } + void MarkValueNonSharable() override + { + m_valueSharable = false; + CreateMatrixIfNull(m_value); + } + + protected: // this function is used to create matrices for those needed before matrix pool is available diff --git a/Source/ComputationNetworkLib/EsotericNodes.h b/Source/ComputationNetworkLib/EsotericNodes.h index 3f500421cf1d..b5c6a6f46ee5 100644 --- a/Source/ComputationNetworkLib/EsotericNodes.h +++ b/Source/ComputationNetworkLib/EsotericNodes.h @@ -1550,7 +1550,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { Init(row_size, col_size); - CreateMatrixIfNull(m_gradient); + //CreateMatrixIfNull(m_gradient); + MarkValueNonSharable(); m_gradient->Resize(row_size, col_size); m_gradient->SetValue(0.0f); } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 898d35f9f65f..f7096fa34efa 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -254,7 +254,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(const TensorShape & sampleLayout, bool isSparse) { m_isSparse = isSparse; - CreateMatrixIfNull(m_value); + //CreateMatrixIfNull(m_value); + MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); From 485a7b8fe6056e4cbd4716bf06fabb9ce64241cb Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Wed, 16 Dec 2015 15:54:08 -0800 Subject: [PATCH 24/49] Revise the implementation of valueNotSharableNode. More to be revised. 
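The revision replaces the one-level check with a memoized bottom-up sweep: each node's "all leaf descendants are parameters" flag is derived from its children's cached flags, so the property propagates through arbitrarily deep graphs in one pass over the evaluation order. A simplified sketch of that memoization (illustrative names; it assumes children are always visited before their parents, as the eval order guarantees):

    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    struct Node
    {
        std::wstring name;
        bool isParameter = false; // true for LearnableParameter leaves
        std::vector<std::shared_ptr<Node>> inputs;
    };

    // Returns, per node name, whether every leaf under it is a parameter.
    std::map<std::wstring, bool>
    ComputeAllLeafDescendantsAreParameters(const std::vector<std::shared_ptr<Node>>& evalOrder)
    {
        std::map<std::wstring, bool> memo;
        for (auto& node : evalOrder)
        {
            if (node->inputs.empty()) // leaf: the flag is just "is it a parameter?"
            {
                memo[node->name] = node->isParameter;
                continue;
            }
            bool all = true;
            for (auto& child : node->inputs)
                all = all && memo.at(child->name); // children already computed
            memo[node->name] = all;
        }
        return memo;
    }

Keying the memo by node name mirrors the patch below; like the patch, it assumes node names are unique within the graph.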
--- .../ComputationNetworkEvaluation.cpp | 47 ++++++++++++++++--- .../ComputationNetworkLib/ComputationNode.h | 24 +++++----- .../InputAndParamNodes.h | 2 + 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 0cdeec07f56e..03d085be8d2b 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -15,6 +15,7 @@ #include #include #include +#include using namespace std; @@ -689,20 +690,52 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) { const auto & nodes = GetEvalOrder(rootNode); + std::map allLeafDescendentsAreParameters; for (auto& node : nodes) { auto children = node->GetInputs(); - bool allChildrenNonSharable = true; - for (auto& child : children) + wstring myname = node->NodeName(); + bool allParameters = true; + + if (children.size()) // we don't do the check for leaf node, cause all the possible leaf nodes (input/parameters/precompute node) are marked as non-sharable already { - if (child->isValueSharable()) + for (auto child : children) { - allChildrenNonSharable = false; - break; + wstring ChildName = child->NodeName(); + if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) + { + // not found, means it is a leaf node (we are at eval order ) + assert(child->IsLeaf()); + if (node->isLearnableParameter()) + { + allLeafDescendentsAreParameters[ChildName] = true; + } + else + { + allParameters = false; + allLeafDescendentsAreParameters[ChildName] = false; + break; + } + } + else + { + if (allLeafDescendentsAreParameters[ChildName] == false) + { + allParameters = false; + break; + } + } + } + allLeafDescendentsAreParameters[myname] = allParameters; + if (allParameters) + { + node->MarkValueNonSharable(); + } + else + { + node->MarkValueSharable(); } } - if (allChildrenNonSharable) - node->MarkValueNonSharable(); } } diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 198c1dcb6421..8064bceb58f0 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -151,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { friend class ComputationNetwork; ComputationNetworkOwnedNodeState() : - m_needsGradient(false) + m_needsGradient(false), m_valueSharable(true) { PurgeStateForFormingRecurrentLoops(); m_isPartOfLoop = false; @@ -166,10 +166,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool IsPartOfLoop() const { return m_isPartOfLoop; } + virtual void MarkValueNonSharable(){ m_valueSharable = false; } + virtual void MarkValueSharable() { m_valueSharable = true; } + bool isValueSharable() { return m_valueSharable; } + protected: // TODO: should be fully encapsulated here bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree) + bool m_valueSharable; // a flag is needed for memory share. 
+ // If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters), + // it will never be released to memory pool private: bool m_isPartOfLoop; // true if this loop is part of a recurrent loop @@ -250,7 +257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0), m_valueSharable(true) + m_numRows(0), m_numCols(0) { } virtual ~ComputationNodeBase(){} @@ -455,14 +462,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - bool isValueSharable() - { - return m_valueSharable; - } - virtual void MarkValueNonSharable() - { - m_valueSharable = false; - } + // sometimes, it is necessary to know whether it is a particular node (e.g., learnable parameter) + virtual bool isLearnableParameter() const { return false; } + protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -778,8 +780,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop - // flags related with sharable values - bool m_valueSharable; // whether value is sharable }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index f7096fa34efa..06571ad6bb8f 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -197,6 +197,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } + + virtual bool isLearnableParameter()const override{ return true; } }; #if 0 From 4b1f8006b3ec4031804fac396f087842d602b02b Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Thu, 17 Dec 2015 17:44:22 -0800 Subject: [PATCH 25/49] Fix MarkValueNotSharableNodes --- .../ComputationNetwork.h | 2 +- .../ComputationNetworkEvaluation.cpp | 90 +++++++++---------- .../ComputationNetworkLib/ComputationNode.h | 6 +- .../InputAndParamNodes.h | 1 - 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index b533602786c5..e8d7ae87fe7e 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -159,7 +159,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo); void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode); - void MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode); + void MarkValueNonSharableNodes(); private: void DetermineSetOfAllRoots(); void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 03d085be8d2b..208692e4561e 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ 
b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -10,6 +10,7 @@ #include "ComputationNode.h" #include "ComputationNetwork.h" #include "RecurrentNodes.h" +#include "InputAndParamNodes.h" #include #include #include @@ -414,13 +415,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (auto & node : m_allRoots) ValidateSubNetwork(node); - // STEP: mark non-sharable function values - // if all the descendants of a particular node are learnable parameters, - // its function value is not sharable - for (auto & node : m_allRoots) - MarkValueNonSharableNodes(node); - - // STEP: Optimize the network. // :) @@ -686,11 +680,48 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } +#if 0 + // prepare to compute with the subnetwork that this rootNode depends on, including + // - auto-detecting recurrent loops + // - collect input and learnable nodes + // - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size) + // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already. + // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice. + void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode) + { + bool inserted = m_built.insert(rootNode).second; // remember we built it + if (!inserted) + return; // already done + + // detect recurrent loops for this root node + // TODO: not nice--why not always call this in ValidateSubNetwork() only? + FormRecurrentLoops(rootNode); + + // for the m_inputValues and m_learnableParameters sets for this rootNode + CollectInputAndLearnableParameters(rootNode); + + // validate the rootNode and all nodes it depends on, in evaluation order + ValidateSubNetwork(rootNode); + } + + // tests whether BuildAndValidateSubNetwork() was called + bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode) + { + return m_built.find(rootNode) != m_built.end(); + } +#endif + + // ----------------------------------------------------------------------- + // memory allocation + // ----------------------------------------------------------------------- // mark nodes that are purely induced by parameters as non-sharable and create space for value if null - void ComputationNetwork::MarkValueNonSharableNodes(const ComputationNodeBasePtr& rootNode) + void ComputationNetwork::MarkValueNonSharableNodes() { - const auto & nodes = GetEvalOrder(rootNode); + const auto & nodes = GetEvalOrder(nullptr); std::map allLeafDescendentsAreParameters; + std::list allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter)); + // note that: we cannot use m_learnableParameters because we need all parameters node, regardless whether it requires update or not + for (auto& node : nodes) { auto children = node->GetInputs(); @@ -706,7 +737,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { // not found, means it is a leaf node (we are at eval order ) assert(child->IsLeaf()); - if (node->isLearnableParameter()) + if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end()) { allLeafDescendentsAreParameters[ChildName] = true; } @@ -740,40 +771,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } -#if 0 - // prepare to compute with the subnetwork that this rootNode depends on, including - // - auto-detecting recurrent loops - // - collect input and learnable nodes - 
// - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size) - // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already. - // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice. - void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode) - { - bool inserted = m_built.insert(rootNode).second; // remember we built it - if (!inserted) - return; // already done - - // detect recurrent loops for this root node - // TODO: not nice--why not always call this in ValidateSubNetwork() only? - FormRecurrentLoops(rootNode); - - // for the m_inputValues and m_learnableParameters sets for this rootNode - CollectInputAndLearnableParameters(rootNode); - - // validate the rootNode and all nodes it depends on, in evaluation order - ValidateSubNetwork(rootNode); - } - - // tests whether BuildAndValidateSubNetwork() was called - bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode) - { - return m_built.find(rootNode) != m_built.end(); - } -#endif - - // ----------------------------------------------------------------------- - // memory allocation - // ----------------------------------------------------------------------- // this function will need to be called before actual validation and execution to // predetermine how to share matrices to reduce memory usage. @@ -788,9 +785,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { VerifyIsCompiled("AllocateAllMatrices"); + // Due to special topology, if a node is solely induced by parameters, its function value should not be shared + MarkValueNonSharableNodes(); + bool performingBackPropagation = (trainRootNode != nullptr); - // Create a composite Eval order with the specfied nodes as roots + // Create a composite Eval order with the specified nodes as roots std::vector forwardPropRoots; forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end()); forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end()); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 8064bceb58f0..5edfdfe9aae9 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -168,7 +168,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void MarkValueNonSharable(){ m_valueSharable = false; } virtual void MarkValueSharable() { m_valueSharable = true; } - bool isValueSharable() { return m_valueSharable; } + bool isValueSharable() const { return m_valueSharable; } protected: // TODO: should be fully encapsulated here @@ -462,8 +462,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } - // sometimes, it is necessary to know whether it is a particular node (e.g., learnable parameter) - virtual bool isLearnableParameter() const { return false; } protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -547,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop; + return 
!g_shareNodeValueMatrices || m_outputNeededDuringBackprop || !isValueSharable(); } const size_t GetNumInputs() const { return m_inputs.size(); } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 06571ad6bb8f..2a8a06c2beb4 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -198,7 +198,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } - virtual bool isLearnableParameter()const override{ return true; } }; #if 0 From 0b8e30ea22d0552f3749cc65a589daedd26f11da Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 18 Dec 2015 14:21:21 -0800 Subject: [PATCH 26/49] Revise the condition of ReleaseMatricesAfterForwardProp: only ValueSharable nodes can be released after forwardprop --- Source/ComputationNetworkLib/ComputationNode.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 5edfdfe9aae9..ec9da238ac50 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -545,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop || !isValueSharable(); + return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop ; } const size_t GetNumInputs() const { return m_inputs.size(); } @@ -912,7 +912,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //don't release matrices that need to be used in the gradient computation virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) { - if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE)) + if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } From 2f51fb24b60d484122e3f1b5948df8c2f97d7d98 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Fri, 18 Dec 2015 23:32:59 -0800 Subject: [PATCH 27/49] Fix a bug in MarkValueSharableNode --- Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp | 2 +- Source/ComputationNetworkLib/ComputationNode.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 208692e4561e..3da5a5ce51df 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -736,7 +736,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) { // not found, means it is a leaf node (we are at eval order ) - assert(child->IsLeaf()); + assert(child->IsLeaf() || child->IsPartOfLoop()); if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end()) { allLeafDescendentsAreParameters[ChildName] = true; diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index ec9da238ac50..fd90a96ac168 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -941,7 +941,7 @@ namespace 
Microsoft { namespace MSR { namespace CNTK { // Release the Value matrix only if the output value is needed during backprop // since in the case it isn't used, we release it during forward prop itself - if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE) + if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } } From b54cfccc37dd08eccae9c9e45baa6691fdd0d720 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Sat, 19 Dec 2015 00:18:13 -0800 Subject: [PATCH 28/49] Add an alternate option "numSubminibatches" for users to indicate how to split minibatches into subminibatches. --- Source/SGDLib/SGD.cpp | 25 ++++++++++++++++++------- Source/SGDLib/SGD.h | 6 +++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index c816931c1884..bab3d7a896d1 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -764,13 +764,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM. DataReaderHelpers::SubminibatchDispatcher smbDispatcher; size_t numSubminibatchesNeeded = 0; - if (m_maxSamplesInRAM < SIZE_MAX) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled + if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled { - // into how many pieces would we need to break the minibatch? - // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. - size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); - size_t estimatedMBSize = tunedMBSize * numParallelSequences; - numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + { + // into how many pieces would we need to break the minibatch? + // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. + size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); + size_t estimatedMBSize = tunedMBSize * numParallelSequences; + numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + } + if (m_numSubminiBatches > 1) + { + numSubminibatchesNeeded = m_numSubminiBatches; + } } // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) @@ -800,7 +807,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { } if (numSubminibatchesNeeded > 1) { - fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + else + fprintf(stderr, ", with %d subminibatch", (int)numSubminibatchesNeeded); } fprintf(stderr, ".\n"); @@ -2484,6 +2494,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector{ 256 }))); m_truncated = configSGD(L"truncated", false); m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t)SIZE_MAX); + m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t)1); // the number of samples in each epoch (0 means, use all the samples in each epoch). 
m_epochSize = configSGD(L"epochSize", (size_t)0); diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 15143dfa0208..b99608b500f8 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -157,7 +157,11 @@ struct SGDParams : public ScriptableObjects::Object // To mitigate this issue, we adopt the sub-minibatch implementation, where // each m_mbSize[epoch] is divided by a few sub-minibatch of which size will be no more than m_maxSamplesInRAM // a forward-backward is performed for each sub-minibathch; a model update is performed after each minibatch - + size_t m_numSubminiBatches; + // alternative method to specify how to split minibatches into subminibatches + // default is 1, which means no subminibatch is used + // if m_maxTempMemSizeInSamples = SIZE_MAX (which means users do not specify the option) and m_numSubminiBatches > 1 + // we divide one minibatch to m_numSubminiBatches subMinibatches // the number of samples in each epoch (0 means, use all the samples in each epoch). size_t m_epochSize; From 6777bbe0b757b5204edd8b15a95c1857890e9fd3 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 21 Dec 2015 21:56:33 -0800 Subject: [PATCH 29/49] Display CUB and CUDNN paths (if defined) in BuildInfo Print BuildInfo at the very begining of the program. convenient for checking build type. --- Source/CNTK/CNTK.cpp | 8 ++++++-- Source/CNTK/prebuild.bat | 10 ++++++++++ Tools/generate_build_info | 3 +++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index e7753e2c70b1..e85dd78015f5 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -345,6 +345,9 @@ void PrintBuiltInfo() #ifdef _CUB_PATH_ fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_); #endif +#ifdef _CUDNN_PATH_ + fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_); +#endif #ifdef _GIT_EXIST fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_); fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_); @@ -568,7 +571,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i RedirectStdErr(logpath); } - PrintBuiltInfo(); + PrintBuiltInfo(); // this one goes to log file std::string timestamp = TimeDateStamp(); //dump config info @@ -643,10 +646,11 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i // main wrapper that catches C++ exceptions and prints them // --------------------------------------------------------------------------- -int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & repots Win32 exceptions +int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions { try { + PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type) if (argc <= 1) InvalidArgument("No command-line argument given."); // detect legacy CNTK configuration diff --git a/Source/CNTK/prebuild.bat b/Source/CNTK/prebuild.bat index 9f841d104da7..12631cf52e37 100644 --- a/Source/CNTK/prebuild.bat +++ b/Source/CNTK/prebuild.bat @@ -33,6 +33,16 @@ if "%cuda_path%" == "" ( echo #define _CUDA_PATH_ "%cuda_path:\=\\%" >> buildinfo.h$$ ) +if not "%cudnn_path%" == "" ( + echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$ + ) + +if not "%cub_path%" == "" ( + echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$ + ) + + + echo #endif >> buildinfo.h$$ ::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time) diff --git 
a/Tools/generate_build_info b/Tools/generate_build_info index a155fc84e792..62686222ef33 100755 --- a/Tools/generate_build_info +++ b/Tools/generate_build_info @@ -56,6 +56,9 @@ makebuildinfo() if [ ! -z "$CUB_PATH" ]; then printf "#define _CUB_PATH_ \"%s\"\n" $CUB_PATH >> $target fi + if [ ! -z "$CUDNN_PATH" ]; then + printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH >> $target + fi printf "#define _BUILDTYPE_ \"%s\"\n" $BUILDTYPE >> $target printf "#endif\n" >> $target } From 60989d7acbf06878c131dfd36aebae22096d54de Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 21 Dec 2015 23:20:13 -0800 Subject: [PATCH 30/49] Move MarkValueNonSharable out of constructors (make gcc happy) --- Source/ComputationNetworkLib/CompositeComputationNodes.h | 6 ++---- Source/ComputationNetworkLib/ComputationNode.h | 3 +-- Source/ComputationNetworkLib/EsotericNodes.h | 3 +-- Source/ComputationNetworkLib/InputAndParamNodes.h | 1 - 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index 5027ef6c2bd9..f8f79dc21642 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -234,8 +234,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void MarkComputed(const bool hasComputed) { m_hasComputed = hasComputed; - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); } virtual bool RequiresPreCompute() const override { return true; } @@ -294,8 +293,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // this is for the special case: convertDBN needs this; because we initialize values directly from another well-trained model virtual void SideLoadFromMatrix(const Matrix& value) { - //CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); m_value->SetValue(value); m_hasComputed = true; } diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index fd90a96ac168..b239cf1b633b 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -824,8 +824,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Since the dimensions are read as well, this function also updates m_numRows/m_numCols.
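// Note: value-sharable marking is now done by the network-level pass in
// ComputationNetworkEvaluation.cpp (see patch 27 above) rather than in node
// constructors; a virtual call such as MarkValueNonSharable() made during
// construction resolves to the base-class override, which may be what the
// "make gcc happy" note refers to. These call sites therefore revert to
// plain CreateMatrixIfNull().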
void LoadValue(File& fstream) { - // CreateMatrixIfNull(m_value); - MarkValueNonSharable(); + CreateMatrixIfNull(m_value); fstream >> Value(); // above reads dimensions, so we must update our own m_numRows/m_numCols SetDims(TensorShape(Value().GetNumRows()), Value().GetNumCols()); diff --git a/Source/ComputationNetworkLib/EsotericNodes.h b/Source/ComputationNetworkLib/EsotericNodes.h index b5c6a6f46ee5..3f500421cf1d 100644 --- a/Source/ComputationNetworkLib/EsotericNodes.h +++ b/Source/ComputationNetworkLib/EsotericNodes.h @@ -1550,8 +1550,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { Init(row_size, col_size); - //CreateMatrixIfNull(m_gradient); - MarkValueNonSharable(); + CreateMatrixIfNull(m_gradient); m_gradient->Resize(row_size, col_size); m_gradient->SetValue(0.0f); } diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 2a8a06c2beb4..333ac54053fb 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -255,7 +255,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(const TensorShape & sampleLayout, bool isSparse) { m_isSparse = isSparse; - //CreateMatrixIfNull(m_value); MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); From 8aa59f700917e34653332d7c874be3860a73f0b7 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 22 Dec 2015 00:19:22 -0800 Subject: [PATCH 31/49] (further remove MarkValueNotSharable out of constructor) --- Source/ComputationNetworkLib/InputAndParamNodes.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 333ac54053fb..879ca71019ec 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -255,7 +255,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(const TensorShape & sampleLayout, bool isSparse) { m_isSparse = isSparse; - MarkValueNonSharable(); if (isSparse) ConvertToSparseMatrix(); From 2d7b74e825d827dd780ac7df4e16fe8eeec61e46 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 22 Dec 2015 12:18:21 -0800 Subject: [PATCH 32/49] (Fix a bug in MarkValueSharable) --- Source/ComputationNetworkLib/InputAndParamNodes.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 879ca71019ec..bf6fe610c035 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -255,12 +255,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Init(const TensorShape & sampleLayout, bool isSparse) { m_isSparse = isSparse; + CreateMatrixIfNull(m_value); if (isSparse) ConvertToSparseMatrix(); SetDims(sampleLayout, 0); UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that) m_parameterUpdateRequired = false; + m_valueSharable = false; } protected: InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) : From 25fd18bf1f5b342b9c757fe4982dce77ee288fc5 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Mon, 11 Jan 2016 17:26:52 -0800 Subject: [PATCH 33/49] Fix an error in SequenceWithSoftmaxNode::RequestMatricesBeforeForwardProp --- Source/ComputationNetworkLib/TrainingCriterionNodes.h | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h index d150b5747703..1722f60aa938 100644 --- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h +++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h @@ -1374,7 +1374,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //request matrices needed to do node function value evaluation virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) { - Base::ReleaseMatricesAfterForwardProp(matrixPool); + Base::ReleaseMatricesAfterBackprop(matrixPool); ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool); ReleaseMatrixToPool(m_softmaxOfRight, matrixPool); ReleaseMatrixToPool(m_gammaFromLattice, matrixPool); From f76412385d8bd01c449d1c93c8701eb4b3bec859 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 11 Jan 2016 22:45:17 -0800 Subject: [PATCH 34/49] Bug workaround: The m_columnsValidityMask matrix in MBLayout type was being default initialized resulting in incorrectly selecting a bad GPU device. --- Source/Common/Include/Sequences.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 57e7e366ddda..c708623695e3 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -90,7 +90,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // construction // ------------------------------------------------------------------- - MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } + MBLayout(size_t numParallelSequences, size_t numTimeSteps) : m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE), m_columnsValidityMask(CPUDEVICE) { Init(numParallelSequences, numTimeSteps); } MBLayout() : MBLayout(1, 0) { } // copy the content of another MBLayoutPtr over From f52e80cf8fcc192bee007937f4ec426043a70a94 Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Fri, 8 Jan 2016 11:39:52 -0800 Subject: [PATCH 35/49] Added CMA to BN node, updated samples. 
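Here CMA = cumulative moving average: instead of an exponential moving
average with a fixed expAvgFactor, the running mean/InvStdDev become the
plain average of the statistics of all minibatches seen so far. A minimal
sketch of the update rule this amounts to (illustrative variable names,
not the node's actual members; see the m_expAvgFactor/m_mbCount handling
in ConvolutionalNodes.h further down):

    // For the t-th minibatch (t = 0, 1, 2, ...) use factor 1/(1+t):
    //   runMean += (batchMean - runMean) / (1 + t)
    // which keeps runMean equal to the average of all batch means so far.
    double factor = 1.0 / (1.0 + mbCount);
    runMean = (1.0 - factor) * runMean + factor * batchMean;
    ++mbCount;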
--- .../Miscellaneous/CIFAR-10/03_ResNet.config | 4 +-- .../Miscellaneous/CIFAR-10/03_ResNet.ndl | 12 ++++---- .../Image/Miscellaneous/CIFAR-10/Macros.ndl | 15 ++++++---- Source/CNTK/SynchronousExecutionEngine.cpp | 3 +- .../ComputationNetworkBuilder.cpp | 4 +-- .../ComputationNetworkBuilder.h | 2 +- .../ConvolutionalNodes.h | 28 +++++++++++++------ 7 files changed, 43 insertions(+), 25 deletions(-) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config index 5497dcfab32a..45eb04b156fc 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config @@ -34,7 +34,7 @@ Train=[ minibatchSize=128 learningRatesPerMB=0.1*80:0.01*40:0.001 momentumPerMB=0.9 - maxEpochs=10 + maxEpochs=120 L2RegWeight=0.0001 dropoutRate=0 @@ -57,7 +57,7 @@ Train=[ height=32 channels=3 cropType=Random - cropRatio=1 + cropRatio=0.8 jitterType=UniRatio interpolations=Linear #meanFile= diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl index 5b53f5d14652..cdee45f2f4fe 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl @@ -48,14 +48,14 @@ DNN=[ rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue) rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue) - # pool - poolW = 3 - poolH = 3 - poolhStride = 2 - poolvStride = 2 + # Global average pooling + poolW = 8 + poolH = 8 + poolhStride = 1 + poolvStride = 1 pool = AveragePooling(rn3_3, poolW, poolH, poolhStride, poolvStride, imageLayout = "cudnn") - ol = DnnLastLayer(576, labelDim, pool, fc1WScale, fc1BValue) + ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue) CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) Err = ErrorPrediction(labels, ol, tag = Eval) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl index 120369a40409..a5edb7ac0d11 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl @@ -11,7 +11,8 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, { W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc = Parameter(outMap, 1, init = fixedValue, value = 1) m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -24,7 +25,8 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scScale) { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc1 = Parameter(outMap, 1, init = fixedValue, value = 1) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -34,7 +36,8 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scScale) W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b2 = 
Parameter(outMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc2 = Parameter(outMap, 1, init = fixedValue, value = 1) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -48,7 +51,8 @@ ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scScale, W { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc1 = Parameter(outMap, 1, init = fixedValue, value = 1) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -58,7 +62,8 @@ ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scScale, W W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + #sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) + sc2 = Parameter(outMap, 1, init = fixedValue, value = 1) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) diff --git a/Source/CNTK/SynchronousExecutionEngine.cpp b/Source/CNTK/SynchronousExecutionEngine.cpp index 5cd5e845b312..40a166d47be6 100644 --- a/Source/CNTK/SynchronousExecutionEngine.cpp +++ b/Source/CNTK/SynchronousExecutionEngine.cpp @@ -452,8 +452,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool eval = node->GetOptionalParameter("eval", "false"); bool spatial = node->GetOptionalParameter("spatial", "false"); double expAvgFactor = node->GetOptionalParameter("expAvgFactor", "1.0"); + ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW")); - nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, eval, spatial, expAvgFactor, name); + nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, eval, spatial, expAvgFactor, imageLayoutKind, name); } } else diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index cddee2800f30..4c6a94c6d84c 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -610,9 +610,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { template shared_ptr> ComputationNetworkBuilder::BatchNormalization(const ComputationNodePtr input, const ComputationNodePtr scale, const ComputationNodePtr bias, const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, - bool eval, bool spatial, double expAvgFactor, const std::wstring nodeName) + bool eval, bool spatial, double expAvgFactor, ImageLayoutKind imageLayoutKind, const std::wstring nodeName) { - return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName, eval, spatial, expAvgFactor), + return net.AddNodeToNetAndAttachInputs(New>(net.GetDeviceId(), nodeName, eval, spatial, expAvgFactor, imageLayoutKind), input, scale, bias, runMean, 
runInvStdDev); } diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h index ce0dc84ccfde..6237cf7cef9e 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h @@ -132,7 +132,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ComputationNodePtr TimeReverse(const ComputationNodePtr input, const std::wstring nodeName = L""); ComputationNodePtr LookupTable(const ComputationNodePtr dictionary, const ComputationNodePtr input, const std::wstring nodeName = L""); ComputationNodePtr BatchNormalization(const ComputationNodePtr input, const ComputationNodePtr scale, const ComputationNodePtr bias, - const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool eval = false, bool spatial = false, double expAvgFactor = 1, const std::wstring nodeName = L""); + const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool eval = false, bool spatial = false, double expAvgFactor = 1, ImageLayoutKind imageLayoutKind = ImageLayoutKind::CHW, const std::wstring nodeName = L""); }; // create a new from config diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index 92a3f073f9a3..d86deb2b3878 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -591,15 +591,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { static const std::wstring TypeName() { return L"BatchNormalization"; } public: BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name) : - Base(deviceId, name), m_eval(false), m_spatial(false), m_expAvgFactor(0) + Base(deviceId, name), m_eval(false), m_spatial(false), m_expAvgFactor(0), m_sampleCount(0), m_imageLayoutKind(ImageLayoutKind::CHW) { } - BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name, bool eval, bool spatial, double expAvgFactor) : - Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_expAvgFactor(expAvgFactor) + BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name, bool eval, bool spatial, double expAvgFactor, ImageLayoutKind imageLayoutKind) : + Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_expAvgFactor(expAvgFactor), m_imageLayoutKind(imageLayoutKind), m_sampleCount(0) { } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : - BatchNormalizationNode(configp->Get(L"deviceId"), L"", configp->Get(L"eval"), configp->Get(L"spatial"), configp->Get(L"expAvgFactor")) + BatchNormalizationNode(configp->Get(L"deviceId"), L"", configp->Get(L"eval"), configp->Get(L"spatial"), configp->Get(L"expAvgFactor"), + ImageLayoutKindFrom(configp->Get(L"imageLayout"))) { AttachInputs(configp, this->GetExpectedNumInputs()); } @@ -612,6 +613,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream << m_eval; fstream << m_spatial; fstream << m_expAvgFactor; + fstream << (int32_t)m_imageLayoutKind; + fstream << m_sampleCount; } void Load(File& fstream, size_t modelVersion) override @@ -635,6 +638,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream >> m_eval; fstream >> m_spatial; fstream >> m_expAvgFactor; + if (verWritten >= 0x00010002) + { + fstream >> m_imageLayoutKind; + fstream >> m_sampleCount; + } } void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override @@ -733,8 +741,6 @@ namespace Microsoft { namespace MSR { 
namespace CNTK { if (isFinalValidationPass) { - const auto m_imageLayoutKind = ImageLayoutKind::CHW; // BUGBUG: Finish this. Must be serialized. - auto shape = GetSampleLayout(); if (m_factory == nullptr) @@ -794,8 +800,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { private: struct VersionInfo { - int32_t VerWrittenCur() const { return 0x00010001; } // Initial - int32_t VerReadableCur() const { return 0x00010001; } + //int32_t VerWrittenCur() const { return 0x00010001; } // Initial + int32_t VerWrittenCur() const { return 0x00010002; } // Added m_imageLayoutKind and m_sampleCount + int32_t VerReadableCur() const { return 0x00010002; } int32_t VerWeCanReadBack() const { return 0x00010001; } }; VersionInfo m_version; @@ -808,6 +815,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_spatial; // Smoothing factor. double m_expAvgFactor; + // Layout (e.g. CHW). + ImageLayoutKind m_imageLayoutKind; + // Sample count, used to compute cumulative moving average. + size_t m_sampleCount; + // Stores pre-computed on forward pass mean values that are used in gradient computation. shared_ptr> m_saveMean; // Stores pre-computed on forward pass InvStdDev values that are used in gradient computation. From cc2a836c85e04525817529381ae43f1e1c2a2607 Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Fri, 8 Jan 2016 16:57:27 -0800 Subject: [PATCH 36/49] Updated samples, added ResNet-50. --- .../CIFAR-10/02_BatchNormConv.ndl | 12 +- .../Miscellaneous/CIFAR-10/03_ResNet.config | 2 +- .../Miscellaneous/CIFAR-10/03_ResNet.mel | 5 + .../Miscellaneous/CIFAR-10/03_ResNet.ndl | 24 ++-- .../Image/Miscellaneous/CIFAR-10/Macros.ndl | 52 ++++---- .../Miscellaneous/ImageNet/ResNet/Macros.ndl | 28 ++-- .../ImageNet/ResNet/ResNet_152.ndl | 30 +++-- .../ImageNet/ResNet/ResNet_34.ndl | 24 ++-- .../ImageNet/ResNet/ResNet_50.config | 123 ++++++++++++++++++ .../ImageNet/ResNet/ResNet_50.ndl | 80 ++++++++++++ .../ConvolutionalNodes.h | 19 +-- 11 files changed, 316 insertions(+), 83 deletions(-) create mode 100644 Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config create mode 100644 Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl diff --git a/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.ndl b/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.ndl index e67e96ea8f10..c446156a2bbb 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.ndl @@ -19,8 +19,10 @@ ndlMnistMacros = [ conv3WScale = 1.414 conv3BValue = 0 - scScale = 0.03 + scValue = 1 + expAvg = 1 + fc1WScale = 12 fc1BValue = 0 fc2WScale = 1.5 @@ -35,7 +37,7 @@ DNN=[ hStride1 = 1 vStride1 = 1 # weight[cMap1, kW1 * kH1 * ImageC] - conv1 = ConvBNReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue, scScale) + conv1 = ConvBNReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue, scValue, expAvg) # pool1 pool1W = 3 @@ -51,7 +53,7 @@ DNN=[ hStride2 = 1 vStride2 = 1 # weight[cMap2, kW2 * kH2 * cMap1] - conv2 = ConvBNReLULayer(pool1, cMap2, 800, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue, scScale) + conv2 = ConvBNReLULayer(pool1, cMap2, 800, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue, scValue, expAvg) # pool2 pool2W = 3 @@ -67,7 +69,7 @@ DNN=[ hStride3 = 1 vStride3 = 1 # weight[cMap3, kW3 * kH3 * cMap2] - conv3 = ConvBNReLULayer(pool2, cMap3, 800, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue, scScale) + conv3 = ConvBNReLULayer(pool2, cMap3, 
800, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue, scValue, expAvg) # pool3 pool3W = 3 @@ -77,7 +79,7 @@ DNN=[ pool3 = MaxPooling(conv3, pool3W, pool3H, pool3hStride, pool3vStride, imageLayout = "cudnn") hiddenDim = 64 - h1 = DnnBNReLULayer(576, hiddenDim, pool3, fc1WScale, fc1BValue) + h1 = DnnBNReLULayer(576, hiddenDim, pool3, fc1WScale, fc1BValue, scValue, expAvg) ol = DNNLastLayer(hiddenDim, labelDim, h1, fc2WScale, fc2BValue) CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config index 45eb04b156fc..c3fd40bfefff 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config @@ -34,7 +34,7 @@ Train=[ minibatchSize=128 learningRatesPerMB=0.1*80:0.01*40:0.001 momentumPerMB=0.9 - maxEpochs=120 + maxEpochs=160 L2RegWeight=0.0001 dropoutRate=0 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel index 115e1f43af88..3c1ef2e34716 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel @@ -23,6 +23,9 @@ rn2_1.bn1_e = BatchNormalization(rn2_1.c1, rn2_1.sc1, rn2_1.b1, rn2_1.m1, rn2_1. SetNodeInput(rn2_1.y1, 0, rn2_1.bn1_e) rn2_1.bn2_e = BatchNormalization(rn2_1.c2, rn2_1.sc2, rn2_1.b2, rn2_1.m2, rn2_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e) +#rn2_1.bn_proj_e = BatchNormalization(rn2_1.c_proj, rn2_1.sc_proj, rn2_1.b_proj, rn2_1.m_proj, rn2_1.isd_proj, eval = true, spatial = true, imageLayout = "cudnn") +SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e) +#SetNodeInput(rn2_1.p, 1, rn2_1.bn_proj_e) rn2_2.bn1_e = BatchNormalization(rn2_2.c1, rn2_2.sc1, rn2_2.b1, rn2_2.m1, rn2_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn2_2.y1, 0, rn2_2.bn1_e) @@ -37,7 +40,9 @@ SetNodeInput(rn2_3.p, 0, rn2_3.bn2_e) rn3_1.bn1_e = BatchNormalization(rn3_1.c1, rn3_1.sc1, rn3_1.b1, rn3_1.m1, rn3_1.isd1, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn3_1.y1, 0, rn3_1.bn1_e) rn3_1.bn2_e = BatchNormalization(rn3_1.c2, rn3_1.sc2, rn3_1.b2, rn3_1.m2, rn3_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") +#rn3_1.bn_proj_e = BatchNormalization(rn3_1.c_proj, rn3_1.sc_proj, rn3_1.b_proj, rn3_1.m_proj, rn3_1.isd_proj, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn3_1.p, 0, rn3_1.bn2_e) +#SetNodeInput(rn3_1.p, 1, rn3_1.bn_proj_e) rn3_2.bn1_e = BatchNormalization(rn3_2.c1, rn3_2.sc1, rn3_2.b1, rn3_2.m1, rn3_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") SetNodeInput(rn3_2.y1, 0, rn3_2.bn1_e) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl index cdee45f2f4fe..d84a9de37dbe 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl @@ -17,7 +17,9 @@ LocalMacros = [ fc1WScale = 12 fc1BValue = 0 - scValue = 0.03 + scValue = 1 + + expAvg = 1 kW = 3 kH = 3 @@ -30,23 +32,23 @@ LocalMacros = [ DNN=[ cMap1 = 16 - conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue) + conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue, expAvg) - rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue) - rn1_2 = ResNetNode2(rn1_1, 
cMap1, 144, kW, kH, convWScale, convBValue, scValue) - rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue) + rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) cMap2 = 32 rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false) - rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, rn2_1_Wproj) - rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue) - rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue) + rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj) + rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) cMap3 = 64 rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false) - rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, rn3_1_Wproj) - rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue) - rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue) + rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj) + rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) # Global average pooling poolW = 8 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl index a5edb7ac0d11..6f2dcde046cf 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl @@ -7,71 +7,77 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) y = RectifiedLinear(p); } -ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scScale) +ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg) { W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc = Parameter(outMap, 1, init = fixedValue, value = 1) + sc = Parameter(outMap, 1, init = fixedValue, value = scValue) m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn") - bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") y = RectifiedLinear(bn); } -ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scScale) +ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, expAvg) { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc1 = 
Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc1 = Parameter(outMap, 1, init = fixedValue, value = 1) + sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn") - bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") y1 = RectifiedLinear(bn1); W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc2 = Parameter(outMap, 1, init = fixedValue, value = 1) + sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn") - bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") p = Plus(bn2, inp) y2 = RectifiedLinear(p); } -ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scScale, Wproj) +ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, expAvg, Wproj) { + # First convolution layer. W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc1 = Parameter(outMap, 1, init = fixedValue, value = 1) + sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true, imageLayout = "cudnn") - bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") y1 = RectifiedLinear(bn1); + # Second convolution layer. 
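    # (The first 3x3 convolution above downsamples with stride 2, while this
    # second one keeps stride 1; the shortcut projection below also uses
    # stride 2 so that both inputs of the Plus have matching dimensions.)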
W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - #sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale) - sc2 = Parameter(outMap, 1, init = fixedValue, value = 1) + sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn") - bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") + bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") - cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn") - p = Plus(bn2, cproj) + # Projection convolution layer. + #b_proj = Parameter(outMap, 1, init = fixedValue, value = bValue) + #sc_proj = Parameter(outMap, 1, init = fixedValue, value = scValue) + #m_proj = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) + #isd_proj = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) + + c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn") + #bn_proj = BatchNormalization(c_proj, sc_proj, b_proj, m_proj, isd_proj, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn") + + #p = Plus(bn2, bn_proj) + p = Plus(bn2, c_proj) y2 = RectifiedLinear(p); } @@ -84,15 +90,15 @@ DnnReLULayer(inDim, outDim, x, wScale, bValue) y = RectifiedLinear(z) } -DnnBNReLULayer(inDim, outDim, x, wScale, bValue) +DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, expAvg) { W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale) b = Parameter(outDim, 1, init = fixedValue, value = bValue) - sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01) + sc = Parameter(outDim, 1, init = fixedValue, value = scValue) m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false) isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, imageLayout = "cudnn") + bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, expAvgFactor = expAvg) y = RectifiedLinear(bn) } diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl index f3bc70221b4b..47af2feb1936 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl @@ -2,7 +2,7 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, { W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc = Parameter(outMap, 1, init = fixedValue, value = scValue) m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -16,7 +16,7 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue) { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc1 = 
Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -26,7 +26,7 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue) W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -41,7 +41,7 @@ ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, { W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale) b1 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue) m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -51,7 +51,7 @@ ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue) m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -69,7 +69,7 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue) # 1x1 reducing convolution. W1 = Parameter(convMap, inMap, init = Gaussian, initValueScale = wScale) b1 = Parameter(convMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(convMap, 1, init = Gaussian, initValueScale = scValue) + sc1 = Parameter(convMap, 1, init = fixedValue, value = scValue) m1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -80,7 +80,7 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue) # 3x3 convolution. W2 = Parameter(convMap, convWCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(convMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(convMap, 1, init = Gaussian, initValueScale = scValue) + sc2 = Parameter(convMap, 1, init = fixedValue, value = scValue) m2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -91,7 +91,7 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue) # 1x1 expanding convolution. 
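    # (ResNetNode3 is the bottleneck block: a 1x1 convolution reduces the
    # channel count to convMap, a 3x3 convolution operates on the reduced
    # maps, and this 1x1 convolution expands back to outMap before the
    # residual Plus.)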
W3 = Parameter(outMap, convMap, init = Gaussian, initValueScale = wScale) b3 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc3 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc3 = Parameter(outMap, 1, init = fixedValue, value = scValue) m3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -102,12 +102,12 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue) y3 = RectifiedLinear(p); } -ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, wProj) +ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, wProj, projStride) { # 1x1 reducing convolution. W1 = Parameter(convMap, inMap, init = Gaussian, initValueScale = wScale) b1 = Parameter(convMap, 1, init = fixedValue, value = bValue) - sc1 = Parameter(convMap, 1, init = Gaussian, initValueScale = scValue) + sc1 = Parameter(convMap, 1, init = fixedValue, value = scValue) m1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) isd1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -118,18 +118,18 @@ ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, # 3x3 convolution. W2 = Parameter(convMap, convWCount, init = Gaussian, initValueScale = wScale) b2 = Parameter(convMap, 1, init = fixedValue, value = bValue) - sc2 = Parameter(convMap, 1, init = Gaussian, initValueScale = scValue) + sc2 = Parameter(convMap, 1, init = fixedValue, value = scValue) m2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) isd2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false) - c2 = Convolution(W2, y1, 3, 3, convMap, 2, 2, zeroPadding = true, imageLayout = "cudnn") + c2 = Convolution(W2, y1, 3, 3, convMap, projStride, projStride, zeroPadding = true, imageLayout = "cudnn") bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn") y2 = RectifiedLinear(bn2); # 1x1 expanding convolution. 
W3 = Parameter(outMap, convMap, init = Gaussian, initValueScale = wScale) b3 = Parameter(outMap, 1, init = fixedValue, value = bValue) - sc3 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) + sc3 = Parameter(outMap, 1, init = fixedValue, value = scValue) m3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) isd3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false) @@ -137,7 +137,7 @@ ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bn3 = BatchNormalization(c3, sc3, b3, m3, isd3, eval = false, spatial = true, imageLayout = "cudnn") # Increasing input dimension convolution - cProj = Convolution(wProj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn") + cProj = Convolution(wProj, inp, 1, 1, outMap, projStride, projStride, zeroPadding = false, imageLayout = "cudnn") p = Plus(bn3, cProj) y3 = RectifiedLinear(p); diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl index 23b4fb86e038..71c54e3fff67 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl @@ -8,8 +8,6 @@ ndlMacros = [ LabelDim = 1000 features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") - featOffs = Const(0, rows = 150528) - featScaled = Plus(features, featOffs) labels = Input(LabelDim, tag = label) # Kernels width and height. @@ -28,7 +26,7 @@ ndlMacros = [ # Initial parameter values. convWScale = 7.07 convBValue = 0 - scValue = 0.03 + scValue = 1 fcWScale = 3.0 fcBValue = 1 ] @@ -41,16 +39,21 @@ DNN=[ cMap5 = 1024 cMap6 = 2048 - conv1 = ConvBNReLULayer(featScaled, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue) - pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn") + conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue) + # Max pooling + pool1W = 2 + pool1H = 2 + pool1hs = 2 + pool1vs = 2 + pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn") rn1_1_Wproj = Parameter(cMap3, cMap1, init = fromFile, initFromFilePath = "$Proj64to256Filename$", needGradient = false) - rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj) + rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj, 1) rn1_2 = ResNetNode3(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue) rn1_3 = ResNetNode3(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue) rn2_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false) - rn2_1 = ResNetNode3Inc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, rn2_1_Wproj) + rn2_1 = ResNetNode3Inc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, rn2_1_Wproj, 2) rn2_2 = ResNetNode3(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) rn2_3 = ResNetNode3(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) rn2_4 = ResNetNode3(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) @@ -60,7 +63,7 @@ DNN=[ rn2_8 = ResNetNode3(rn2_7, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) rn3_1_Wproj = Parameter(cMap5, cMap4, init = fromFile, initFromFilePath = "$Proj512to1024Filename$", needGradient = false) - rn3_1 = ResNetNode3Inc(rn2_8, cMap4, cMap3, cMap5, 2304, convWScale, 
convBValue, scValue, rn3_1_Wproj) + rn3_1 = ResNetNode3Inc(rn2_8, cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, rn3_1_Wproj, 2) rn3_2 = ResNetNode3(rn3_1, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) rn3_3 = ResNetNode3(rn3_2, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) rn3_4 = ResNetNode3(rn3_3, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) @@ -98,13 +101,18 @@ DNN=[ rn3_36= ResNetNode3(rn3_35, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) rn4_1_Wproj = Parameter(cMap6, cMap5, init = fromFile, initFromFilePath = "$Proj1024to2048Filename$", needGradient = false) - rn4_1 = ResNetNode3Inc(rn3_36, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, rn4_1_Wproj) + rn4_1 = ResNetNode3Inc(rn3_36, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, rn4_1_Wproj, 2) rn4_2 = ResNetNode3(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue) rn4_3 = ResNetNode3(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue) - pool5 = AveragePooling(rn4_3, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn") + # Global average pooling + pool2W = 7 + pool2H = 7 + pool2hs = 1 + pool2vs = 1 + pool2 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn") - ol = DnnLayer(8192, labelDim, pool5, fcWScale, fcBValue) + ol = DnnLayer(cMap6, labelDim, pool2, fcWScale, fcBValue) CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) Err = ErrorPrediction(labels, ol, tag = Eval) diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl index c1297f32547e..73108ca6da1c 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl @@ -17,16 +17,10 @@ ndlMacros = [ hs = 1 vs = 1 - # Pooling settings. - poolW = 2 - poolH = 2 - poolhs = 2 - poolvs = 2 - # Initial parameter values. 
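    # (scValue = 1 initializes the batch-norm scale parameters to the
    # identity transform; earlier revisions drew them from a Gaussian with
    # initValueScale 0.03.)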
convWScale = 7.07 convBValue = 0 - scValue = 0.03 + scValue = 1 fcWScale = 3.0 fcBValue = 1 ] @@ -34,7 +28,12 @@ ndlMacros = [ DNN=[ cMap1 = 64 conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue) - pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn") + # Max pooling + pool1W = 2 + pool1H = 2 + pool1hs = 2 + pool1vs = 2 + pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn") rn1_1 = ResNetNode2(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue) rn1_2 = ResNetNode2(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue) @@ -62,9 +61,14 @@ DNN=[ rn4_2 = ResNetNode2(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue) rn4_3 = ResNetNode2(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue) - pool5 = AveragePooling(rn4_3, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn") + # Global average pooling + pool2W = 7 + pool2H = 7 + pool2hs = 1 + pool2vs = 1 + pool5 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn") - ol = DnnLayer(4608, labelDim, pool5, fcWScale, fcBValue) + ol = DnnLayer(cMap4, labelDim, pool5, fcWScale, fcBValue) CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) Err = ErrorPrediction(labels, ol, tag = Eval) diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config new file mode 100644 index 000000000000..520e68bc7551 --- /dev/null +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config @@ -0,0 +1,123 @@ +RootDir = "." + +ConfigDir = "$RootDir$" +DataDir = "$RootDir$" +OutputDir = "$RootDir$/Output" +ModelDir = "$OutputDir$/Models" + +ndlMacros=$ConfigDir$/Macros.ndl + +precision=float +deviceId=Auto + +command=Train:AddTop5Eval:Test + +parallelTrain=false + +stderr=$OutputDir$/ResNet_50 +traceLevel=1 + +Proj64to256Filename = $ConfigDir$/64to256.txt +Proj256to512Filename = $ConfigDir$/256to512.txt +Proj512to1024Filename = $ConfigDir$/512to1024.txt +Proj1024to2048Filename = $ConfigDir$/1024to2048.txt + +Train=[ + action=train + modelPath=$ModelDir$/ResNet_50 + + NDLNetworkBuilder=[ + networkDescription=$ConfigDir$/ResNet_50.ndl + ] + + SGD=[ + epochSize=0 + minibatchSize=32 + learningRatesPerMB=0.1*30:0.03*30:0.01*25:0.003*25:0.001 + momentumPerMB=0.9 + maxEpochs=120 + gradUpdateType=None + L2RegWeight=0.0001 + dropoutRate=0 + + ParallelTrain=[ + parallelizationMethod=DataParallelSGD + distributedMBReading=true + parallelizationStartEpoch=1 + DataParallelSGD=[ + gradientBits=1 + ] + ] + + numMBsToShowResult=100 + ] + + reader=[ + readerType=ImageReader + # Map file which maps images to labels using the following format: + # + # Example: + # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG0 + file=$DataDir$/train_map.txt + # Randomize images before every epoch. Possible values: None, Auto. Default: Auto. + randomize=Auto + features=[ + # Below are the required parameters. + width=224 + height=224 + channels=3 + # Below are the optional parameters. + # Possible values: Center, Random. Default: Center + cropType=Random + # Horizontal random flip, will be enabled by default if cropType=Random + #hflip=0 + # Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1. + cropRatio=0.46666:0.875 + # Crop scale ratio jitter type. + # Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio + jitterType=UniRatio + # Interpolation to use when scaling image to width x height size. 
+ # Possible values: nearest, linear, cubic, lanczos. Default: linear. + interpolations=Linear + # Stores mean values for each pixel in OpenCV matrix XML format. + meanFile=$ConfigDir$/ImageNet1K_mean.xml + ] + labels=[ + labelDim=1000 + ] + ] +] + +AddTop5Eval=[ + action=edit + CurModel=$ModelDir$/ResNet_50 + NewModel=$ModelDir$/ResNet_50.Top5 + editPath=$ConfigDir$/add_top5_layer.mel +] + +Test=[ + action=test + modelPath=$ModelDir$/ResNet_50.Top5 + # Set minibatch size for testing. + minibatchSize=32 + + NDLNetworkBuilder=[ + networkDescription=$ConfigDir$/ResNet_50.ndl + ] + + reader=[ + readerType=ImageReader + file=$DataDir$/val_map.txt + randomize=None + features=[ + width=224 + height=224 + channels=3 + cropType=Center + meanFile=$ConfigDir$/ImageNet1K_mean.xml + ] + labels=[ + labelDim=1000 + ] + ] +] diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl new file mode 100644 index 000000000000..4d38a2af9315 --- /dev/null +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl @@ -0,0 +1,80 @@ +load=ndlMacros +run=DNN + +ndlMacros = [ + ImageW = 224 + ImageH = 224 + ImageC = 3 + LabelDim = 1000 + + features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") + labels = Input(LabelDim, tag = label) + + # Kernels width and height. + kW = 3 + kH = 3 + # Kernel stride. + hs = 1 + vs = 1 + + # Initial parameter values. + convWScale = 7.07 + convBValue = 0 + scValue = 1 + fcWScale = 3.0 + fcBValue = 1 +] + +DNN=[ + cMap1 = 64 + cMap2 = 128 + cMap3 = 256 + cMap4 = 512 + cMap5 = 1024 + cMap6 = 2048 + + conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue) + # Max pooling + pool1W = 2 + pool1H = 2 + pool1hs = 2 + pool1vs = 2 + pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn") + + rn1_1_Wproj = Parameter(cMap3, cMap1, init = fromFile, initFromFilePath = "$Proj64to256Filename$", needGradient = false) + rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj, 1) + rn1_2 = ResNetNode3(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue) + rn1_3 = ResNetNode3(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue) + + rn2_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false) + rn2_1 = ResNetNode3Inc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, rn2_1_Wproj, 2) + rn2_2 = ResNetNode3(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) + rn2_3 = ResNetNode3(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) + rn2_4 = ResNetNode3(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue) + + rn3_1_Wproj = Parameter(cMap5, cMap4, init = fromFile, initFromFilePath = "$Proj512to1024Filename$", needGradient = false) + rn3_1 = ResNetNode3Inc(rn2_4, cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, rn3_1_Wproj, 2) + rn3_2 = ResNetNode3(rn3_1, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + rn3_3 = ResNetNode3(rn3_2, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + rn3_4 = ResNetNode3(rn3_3, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + rn3_5 = ResNetNode3(rn3_4, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + rn3_6 = ResNetNode3(rn3_5, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue) + + rn4_1_Wproj = Parameter(cMap6, cMap5, init = fromFile, initFromFilePath = 
"$Proj1024to2048Filename$", needGradient = false) + rn4_1 = ResNetNode3Inc(rn3_6, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, rn4_1_Wproj, 2) + rn4_2 = ResNetNode3(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue) + rn4_3 = ResNetNode3(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue) + + # Global average pooling + pool2W = 7 + pool2H = 7 + pool2hs = 1 + pool2vs = 1 + pool2 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn") + + ol = DnnLayer(cMap6, labelDim, pool2, fcWScale, fcBValue) + + CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) + Err = ErrorPrediction(labels, ol, tag = Eval) + OutputNodes = ol +] diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index d86deb2b3878..dffa996af6cf 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -591,11 +591,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { static const std::wstring TypeName() { return L"BatchNormalization"; } public: BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name) : - Base(deviceId, name), m_eval(false), m_spatial(false), m_expAvgFactor(0), m_sampleCount(0), m_imageLayoutKind(ImageLayoutKind::CHW) + Base(deviceId, name), m_eval(false), m_spatial(false), m_expAvgFactor(0), m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW) { } BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring & name, bool eval, bool spatial, double expAvgFactor, ImageLayoutKind imageLayoutKind) : - Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_expAvgFactor(expAvgFactor), m_imageLayoutKind(imageLayoutKind), m_sampleCount(0) + Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_expAvgFactor(expAvgFactor), m_imageLayoutKind(imageLayoutKind), m_mbCount(0) { } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : @@ -614,7 +614,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream << m_spatial; fstream << m_expAvgFactor; fstream << (int32_t)m_imageLayoutKind; - fstream << m_sampleCount; + fstream << m_mbCount; } void Load(File& fstream, size_t modelVersion) override @@ -641,7 +641,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (verWritten >= 0x00010002) { fstream >> m_imageLayoutKind; - fstream >> m_sampleCount; + fstream >> m_mbCount; } } @@ -724,8 +724,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_convEng->NormalizeBatchInference(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, runMean, runInvStdDev, sliceOutputValue); else { - m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, m_expAvgFactor, runMean, runInvStdDev, + // REVIEW alexeyk: hack, use m_expAvgFactor <= 0 to compute CMA. + double expAvgFactor = (m_expAvgFactor > 0) ? 
m_expAvgFactor : (1.0 / (1.0 + m_mbCount)); + m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, expAvgFactor, runMean, runInvStdDev, sliceOutputValue, *m_saveMean, *m_saveInvStdDev); + m_mbCount++; } #if NANCHECK sliceOutputValue.HasNan("BatchNormalization"); @@ -801,7 +804,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { struct VersionInfo { //int32_t VerWrittenCur() const { return 0x00010001; } // Initial - int32_t VerWrittenCur() const { return 0x00010002; } // Added m_imageLayoutKind and m_sampleCount + int32_t VerWrittenCur() const { return 0x00010002; } // Added m_imageLayoutKind and m_mbCount int32_t VerReadableCur() const { return 0x00010002; } int32_t VerWeCanReadBack() const { return 0x00010001; } }; @@ -817,8 +820,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { double m_expAvgFactor; // Layout (e.g. CHW). ImageLayoutKind m_imageLayoutKind; - // Sample count, used to compute cumulative moving average. - size_t m_sampleCount; + // Minibatch count, used to compute cumulative moving average. + size_t m_mbCount; // Stores pre-computed on forward pass mean values that are used in gradient computation. shared_ptr> m_saveMean; From 9e25b7e61a9ffede4ba903d3cfd86cb2d843c953 Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Tue, 12 Jan 2016 13:37:38 -0800 Subject: [PATCH 37/49] Removed Resize from BN code. Updated samples. --- .../Miscellaneous/CIFAR-10/03_ResNet.config | 6 +++--- .../Image/Miscellaneous/CIFAR-10/03_ResNet.ndl | 7 ++++--- .../ComputationNetworkLib/ConvolutionalNodes.h | 13 ++++++++++++- Source/Math/CuDnnConvolutionEngine.cpp | 17 ++++++++++++----- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config index c3fd40bfefff..dd6c394a6471 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config @@ -34,7 +34,7 @@ Train=[ minibatchSize=128 learningRatesPerMB=0.1*80:0.01*40:0.001 momentumPerMB=0.9 - maxEpochs=160 + maxEpochs=80 L2RegWeight=0.0001 dropoutRate=0 @@ -60,7 +60,7 @@ Train=[ cropRatio=0.8 jitterType=UniRatio interpolations=Linear - #meanFile= + meanFile=$ConfigDir$/CIFAR-10_mean.xml ] labels=[ labelDim=10 @@ -97,7 +97,7 @@ Test=[ cropRatio=1 jitterType=UniRatio interpolations=Linear - #meanFile= + meanFile=$ConfigDir$/CIFAR-10_mean.xml ] labels=[ labelDim=10 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl index d84a9de37dbe..f267db665f7a 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl @@ -8,8 +8,8 @@ LocalMacros = [ LabelDim = 10 features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") - featOffs = Const(128) - featScaled = Minus(features, featOffs) + #featOffs = Const(128) + #featScaled = Minus(features, featOffs) labels = Input(LabelDim, tag = label) convWScale = 7.07 @@ -31,8 +31,9 @@ LocalMacros = [ ] DNN=[ + conv1WScale = 0.26 cMap1 = 16 - conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue, expAvg) + conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hStride1, vStride1, conv1WScale, convBValue, scValue, expAvg) rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) 
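A note on the running-statistics path above: when m_expAvgFactor <= 0, each minibatch blends its statistics into the running estimates with blend factor 1/(1 + m_mbCount), so after t minibatches runMean equals the plain average of the t per-minibatch means. A minimal sketch of that recurrence (toy Python with made-up data; variable names are ours, not CNTK's):

# Cumulative moving average via a shrinking blend factor:
#   runMean <- (1 - a) * runMean + a * mbMean,  with a = 1 / (1 + t)
means = [2.0, 4.0, 6.0, 8.0]            # toy per-minibatch means
run_mean = 0.0
for t, mb_mean in enumerate(means):     # t = minibatches seen so far (m_mbCount)
    a = 1.0 / (1.0 + t)                 # mirrors expAvgFactor = 1/(1 + m_mbCount)
    run_mean = (1.0 - a) * run_mean + a * mb_mean
assert abs(run_mean - sum(means) / len(means)) < 1e-12   # equals the plain mean

The same recurrence with a fixed blend factor gives the usual exponential moving average instead.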
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index dffa996af6cf..9c06b0795f23 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -726,12 +726,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { { // REVIEW alexeyk: hack, use m_expAvgFactor <= 0 to compute CMA. double expAvgFactor = (m_expAvgFactor > 0) ? m_expAvgFactor : (1.0 / (1.0 + m_mbCount)); + + if (m_saveMean->GetNumElements() != runMean.GetNumElements()) + m_saveMean->Resize(runMean.GetNumRows(), runMean.GetNumCols()); + if (m_saveInvStdDev->GetNumElements() != runMean.GetNumElements()) + m_saveInvStdDev->Resize(runMean.GetNumRows(), runMean.GetNumCols()); + m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, expAvgFactor, runMean, runInvStdDev, sliceOutputValue, *m_saveMean, *m_saveInvStdDev); + m_mbCount++; } #if NANCHECK - sliceOutputValue.HasNan("BatchNormalization"); + sliceOutputValue.HasNan("BatchNormalization-output"); + runMean.HasNan("BatchNormalization-runMean"); + runInvStdDev.HasNan("BatchNormalization-runInvStdDev"); + m_saveMean->HasNan("BatchNormalization-saveMean"); + m_saveInvStdDev->HasNan("BatchNormalization-saveInvStdDev"); #endif } diff --git a/Source/Math/CuDnnConvolutionEngine.cpp b/Source/Math/CuDnnConvolutionEngine.cpp index ae9336d56910..e5c7c871c064 100644 --- a/Source/Math/CuDnnConvolutionEngine.cpp +++ b/Source/Math/CuDnnConvolutionEngine.cpp @@ -312,28 +312,35 @@ namespace Microsoft { namespace MSR { namespace CNTK { void NormalizeBatch(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias, bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out, Mat& saveMean, Mat& saveInvStdDev) override { + const size_t crowIn = inT.w() * inT.h() * inT.c(); + UNUSED(crowIn); // crowIn used only in asserts. if (spatial) { assert(scaleBiasT.c() == inT.c()); assert(scaleBiasT.w() == 1); assert(scaleBiasT.h() == 1); + assert(runMean.GetNumRows() == inT.c()); + assert(runMean.GetNumCols() == 1); + assert(runInvStdDev.GetNumRows() == inT.c()); + assert(runInvStdDev.GetNumCols() == 1); } else { assert(scaleBiasT.c() == inT.c()); assert(scaleBiasT.w() == inT.w()); assert(scaleBiasT.h() == inT.h()); + assert(runMean.GetNumRows() == crowIn); + assert(runMean.GetNumCols() == 1); + assert(runInvStdDev.GetNumRows() == crowIn); + assert(runInvStdDev.GetNumCols() == 1); } assert(scaleBiasT.n() == 1); - const size_t crowIn = inT.w() * inT.h() * inT.c(); assert(crowIn == in.GetNumRows()); assert(inT.n() == in.GetNumCols()); + assert(saveMean.GetNumElements() >= runMean.GetNumElements()); + assert(saveInvStdDev.GetNumElements() >= runInvStdDev.GetNumElements()); cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION; - runMean.Resize(spatial ? inT.c() : crowIn, 1); - runInvStdDev.Resize(runMean.GetNumRows(), 1); - saveMean.Resize(runMean.GetNumRows(), 1); - saveInvStdDev.Resize(runMean.GetNumRows(), 1); CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out), t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev))); } From 92e8a4d136492be3bd3f5b4cd9031f3a5b1e45fb Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Tue, 12 Jan 2016 14:25:13 -0800 Subject: [PATCH 38/49] Added BN eval mode to MEL. 
Updated samples. --- .../CIFAR-10/02_BatchNormConv.mel | 12 +- .../Miscellaneous/CIFAR-10/03_ResNet.config | 2 +- .../Miscellaneous/CIFAR-10/03_ResNet.mel | 53 +-------- .../Miscellaneous/CIFAR-10/03_ResNet.ndl | 2 - .../CIFAR-10/04_ResNet_56.config | 106 +++++++++++++++++ .../Miscellaneous/CIFAR-10/04_ResNet_56.ndl | 110 ++++++++++++++++++ Source/CNTK/ModelEditLanguage.cpp | 44 ++++++- .../ComputationNetwork.h | 1 + .../ComputationNetworkEditing.cpp | 38 ++++++ .../ConvolutionalNodes.h | 5 + 10 files changed, 306 insertions(+), 67 deletions(-) create mode 100644 Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config create mode 100644 Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl diff --git a/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel b/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel index 809ac67784e5..c36f29c41b9b 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel +++ b/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv.mel @@ -1,16 +1,6 @@ m=LoadModel($CurModel$, format=cntk) SetDefaultModel(m) -conv1.bn_e = BatchNormalization(conv1.c, conv1.sc, conv1.b, conv1.m, conv1.isd, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(conv1.y, 0, conv1.bn_e) - -conv2.bn_e = BatchNormalization(conv2.c, conv2.sc, conv2.b, conv2.m, conv2.isd, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(conv2.y, 0, conv2.bn_e) - -conv3.bn_e = BatchNormalization(conv3.c, conv3.sc, conv3.b, conv3.m, conv3.isd, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(conv3.y, 0, conv3.bn_e) - -h1.bn_e = BatchNormalization(h1.t, h1.sc, h1.b, h1.m, h1.isd, eval = true, spatial = false) -SetNodeInput(h1.y, 0, h1.bn_e) +SetPropertyForSubTree(CE, batchNormEvalMode, true) SaveModel(m, $NewModel$, format=cntk) \ No newline at end of file diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config index dd6c394a6471..a32619c04a5b 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config @@ -34,7 +34,7 @@ Train=[ minibatchSize=128 learningRatesPerMB=0.1*80:0.01*40:0.001 momentumPerMB=0.9 - maxEpochs=80 + maxEpochs=160 L2RegWeight=0.0001 dropoutRate=0 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel index 3c1ef2e34716..c36f29c41b9b 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.mel @@ -1,57 +1,6 @@ m=LoadModel($CurModel$, format=cntk) SetDefaultModel(m) -conv1.bn_e = BatchNormalization(conv1.c, conv1.sc, conv1.b, conv1.m, conv1.isd, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(conv1.y, 0, conv1.bn_e) - -rn1_1.bn1_e = BatchNormalization(rn1_1.c1, rn1_1.sc1, rn1_1.b1, rn1_1.m1, rn1_1.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_1.y1, 0, rn1_1.bn1_e) -rn1_1.bn2_e = BatchNormalization(rn1_1.c2, rn1_1.sc2, rn1_1.b2, rn1_1.m2, rn1_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_1.p, 0, rn1_1.bn2_e) - -rn1_2.bn1_e = BatchNormalization(rn1_2.c1, rn1_2.sc1, rn1_2.b1, rn1_2.m1, rn1_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_2.y1, 0, rn1_2.bn1_e) -rn1_2.bn2_e = BatchNormalization(rn1_2.c2, rn1_2.sc2, rn1_2.b2, rn1_2.m2, rn1_2.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_2.p, 0, rn1_2.bn2_e) - -rn1_3.bn1_e = 
BatchNormalization(rn1_3.c1, rn1_3.sc1, rn1_3.b1, rn1_3.m1, rn1_3.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_3.y1, 0, rn1_3.bn1_e) -rn1_3.bn2_e = BatchNormalization(rn1_3.c2, rn1_3.sc2, rn1_3.b2, rn1_3.m2, rn1_3.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn1_3.p, 0, rn1_3.bn2_e) - -rn2_1.bn1_e = BatchNormalization(rn2_1.c1, rn2_1.sc1, rn2_1.b1, rn2_1.m1, rn2_1.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_1.y1, 0, rn2_1.bn1_e) -rn2_1.bn2_e = BatchNormalization(rn2_1.c2, rn2_1.sc2, rn2_1.b2, rn2_1.m2, rn2_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e) -#rn2_1.bn_proj_e = BatchNormalization(rn2_1.c_proj, rn2_1.sc_proj, rn2_1.b_proj, rn2_1.m_proj, rn2_1.isd_proj, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e) -#SetNodeInput(rn2_1.p, 1, rn2_1.bn_proj_e) - -rn2_2.bn1_e = BatchNormalization(rn2_2.c1, rn2_2.sc1, rn2_2.b1, rn2_2.m1, rn2_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_2.y1, 0, rn2_2.bn1_e) -rn2_2.bn2_e = BatchNormalization(rn2_2.c2, rn2_2.sc2, rn2_2.b2, rn2_2.m2, rn2_2.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_2.p, 0, rn2_2.bn2_e) - -rn2_3.bn1_e = BatchNormalization(rn2_3.c1, rn2_3.sc1, rn2_3.b1, rn2_3.m1, rn2_3.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_3.y1, 0, rn2_3.bn1_e) -rn2_3.bn2_e = BatchNormalization(rn2_3.c2, rn2_3.sc2, rn2_3.b2, rn2_3.m2, rn2_3.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn2_3.p, 0, rn2_3.bn2_e) - -rn3_1.bn1_e = BatchNormalization(rn3_1.c1, rn3_1.sc1, rn3_1.b1, rn3_1.m1, rn3_1.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_1.y1, 0, rn3_1.bn1_e) -rn3_1.bn2_e = BatchNormalization(rn3_1.c2, rn3_1.sc2, rn3_1.b2, rn3_1.m2, rn3_1.isd2, eval = true, spatial = true, imageLayout = "cudnn") -#rn3_1.bn_proj_e = BatchNormalization(rn3_1.c_proj, rn3_1.sc_proj, rn3_1.b_proj, rn3_1.m_proj, rn3_1.isd_proj, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_1.p, 0, rn3_1.bn2_e) -#SetNodeInput(rn3_1.p, 1, rn3_1.bn_proj_e) - -rn3_2.bn1_e = BatchNormalization(rn3_2.c1, rn3_2.sc1, rn3_2.b1, rn3_2.m1, rn3_2.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_2.y1, 0, rn3_2.bn1_e) -rn3_2.bn2_e = BatchNormalization(rn3_2.c2, rn3_2.sc2, rn3_2.b2, rn3_2.m2, rn3_2.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_2.p, 0, rn3_2.bn2_e) - -rn3_3.bn1_e = BatchNormalization(rn3_3.c1, rn3_3.sc1, rn3_3.b1, rn3_3.m1, rn3_3.isd1, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_3.y1, 0, rn3_3.bn1_e) -rn3_3.bn2_e = BatchNormalization(rn3_3.c2, rn3_3.sc2, rn3_3.b2, rn3_3.m2, rn3_3.isd2, eval = true, spatial = true, imageLayout = "cudnn") -SetNodeInput(rn3_3.p, 0, rn3_3.bn2_e) +SetPropertyForSubTree(CE, batchNormEvalMode, true) SaveModel(m, $NewModel$, format=cntk) \ No newline at end of file diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl index f267db665f7a..3d3e69be6bb3 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl @@ -8,8 +8,6 @@ LocalMacros = [ LabelDim = 10 features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") - #featOffs = Const(128) - #featScaled = Minus(features, featOffs) labels = 
Input(LabelDim, tag = label) convWScale = 7.07 diff --git a/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config new file mode 100644 index 000000000000..dc20fc41c14a --- /dev/null +++ b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.config @@ -0,0 +1,106 @@ +RootDir = "." + +ConfigDir = "$RootDir$" +DataDir = "$RootDir$" +OutputDir = "$RootDir$/Output" +ModelDir = "$OutputDir$/Models" + +ndlMacros=$ConfigDir$/Macros.ndl + +precision=float +deviceId=Auto +prefetch=true +parallelTrain=false + +command=Train:AddBNEval:Test + +stderr=$OutputDir$/04_ResNet_56 +traceLevel=1 +numMBsToShowResult=200 + +Proj16to32Filename = $ConfigDir$/16to32.txt +Proj32to64Filename = $ConfigDir$/32to64.txt + +Train=[ + action=train + modelPath=$ModelDir$/04_ResNet_56 + + NDLNetworkBuilder=[ + networkDescription=$ConfigDir$/04_ResNet_56.ndl + ] + + SGD=[ + epochSize=0 + minibatchSize=128 + learningRatesPerMB=0.1*80:0.01*40:0.001 + momentumPerMB=0.9 + maxEpochs=1 + L2RegWeight=0.0001 + dropoutRate=0 + + ParallelTrain=[ + parallelizationMethod=DataParallelSGD + distributedMBReading=true + parallelizationStartEpoch=1 + DataParallelSGD=[ + gradientBits=1 + ] + ] + ] + + reader=[ + readerType=ImageReader + file=$DataDir$/train_map.txt + randomize=Auto + features=[ + width=32 + height=32 + channels=3 + cropType=Random + cropRatio=0.8 + jitterType=UniRatio + interpolations=Linear + meanFile=$ConfigDir$/CIFAR-10_mean.xml + ] + labels=[ + labelDim=10 + ] + ] +] + +AddBNEval=[ + action=edit + CurModel=$ModelDir$/04_ResNet_56 + NewModel=$ModelDir$/04_ResNet_56.Eval + editPath=$ConfigDir$/03_ResNet.mel +] + +Test=[ + action=test + modelPath=$ModelDir$/04_ResNet_56 + # Set minibatch size for testing. + minibatchSize=512 + + NDLNetworkBuilder=[ + networkDescription=$ConfigDir$/04_ResNet_56.ndl + ] + + reader=[ + readerType=ImageReader + file=$DataDir$/test_map.txt + randomize=Auto + features=[ + width=32 + height=32 + channels=3 + cropType=Center + cropRatio=1 + jitterType=UniRatio + interpolations=Linear + meanFile=$ConfigDir$/CIFAR-10_mean.xml + ] + labels=[ + labelDim=10 + ] + ] +] diff --git a/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl new file mode 100644 index 000000000000..98bceba2c9dd --- /dev/null +++ b/Examples/Image/Miscellaneous/CIFAR-10/04_ResNet_56.ndl @@ -0,0 +1,110 @@ +load=LocalMacros +run=DNN + +LocalMacros = [ + ImageW = 32 + ImageH = 32 + ImageC = 3 + LabelDim = 10 + + features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn") + labels = Input(LabelDim, tag = label) + + convWScale = 7.07 + convBValue = 0 + fc1WScale = 12 + fc1BValue = 0 + + scValue = 1 + + expAvg = 1 + + kW = 3 + kH = 3 + + hStride1 = 1 + vStride1 = 1 + hStride2 = 2 + vStride2 = 2 +] + +DNN=[ + conv1WScale = 0.26 + cMap1 = 16 + conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hStride1, vStride1, conv1WScale, convBValue, scValue, expAvg) + + rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_4 = ResNetNode2(rn1_3, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_5 = ResNetNode2(rn1_4, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_6 = ResNetNode2(rn1_5, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_7 = 
ResNetNode2(rn1_6, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_8 = ResNetNode2(rn1_7, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_9 = ResNetNode2(rn1_8, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_10= ResNetNode2(rn1_9, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_11= ResNetNode2(rn1_10, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_12= ResNetNode2(rn1_11, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_13= ResNetNode2(rn1_12, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_14= ResNetNode2(rn1_13, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_15= ResNetNode2(rn1_14, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_16= ResNetNode2(rn1_15, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_17= ResNetNode2(rn1_16, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, expAvg) + + cMap2 = 32 + rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false) + rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj) + rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_4 = ResNetNode2(rn2_3, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_5 = ResNetNode2(rn2_4, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_6 = ResNetNode2(rn2_5, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_7 = ResNetNode2(rn2_6, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_8 = ResNetNode2(rn2_7, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_9 = ResNetNode2(rn2_8, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_10= ResNetNode2(rn2_9, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_11= ResNetNode2(rn2_10, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_12= ResNetNode2(rn2_11, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_13= ResNetNode2(rn2_12, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_14= ResNetNode2(rn2_13, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_15= ResNetNode2(rn2_14, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_16= ResNetNode2(rn2_15, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_17= ResNetNode2(rn2_16, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, expAvg) + + cMap3 = 64 + rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false) + rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj) + rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_4 = ResNetNode2(rn3_3, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_5 = ResNetNode2(rn3_4, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_6 = ResNetNode2(rn3_5, cMap3, 576, kW, kH, convWScale, convBValue, 
scValue, expAvg) + rn3_7 = ResNetNode2(rn3_6, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_8 = ResNetNode2(rn3_7, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_9 = ResNetNode2(rn3_8, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_10= ResNetNode2(rn3_9, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_11= ResNetNode2(rn3_10, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_12= ResNetNode2(rn3_11, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_13= ResNetNode2(rn3_12, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_14= ResNetNode2(rn3_13, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_15= ResNetNode2(rn3_14, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_16= ResNetNode2(rn3_15, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_17= ResNetNode2(rn3_16, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + rn3_18= ResNetNode2(rn3_17, cMap3, 576, kW, kH, convWScale, convBValue, scValue, expAvg) + + # Global average pooling + poolW = 8 + poolH = 8 + poolhStride = 1 + poolvStride = 1 + pool = AveragePooling(rn3_18, poolW, poolH, poolhStride, poolvStride, imageLayout = "cudnn") + + ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue) + + CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria) + Err = ErrorPrediction(labels, ol, tag = Eval) + OutputNodes = ol +] + diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp index 981b63ffd53f..9b1ec6fa176d 100644 --- a/Source/CNTK/ModelEditLanguage.cpp +++ b/Source/CNTK/ModelEditLanguage.cpp @@ -9,6 +9,7 @@ #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings #include "ModelEditLanguage.h" +#include "ConvolutionalNodes.h" #include namespace Microsoft { namespace MSR { namespace CNTK { @@ -56,7 +57,8 @@ enum MELProperty melPropFinalCriterion, melPropEvaluation, melPropOutput, - melPropRecurrent + melPropRecurrent, + melPropBatchNormMode }; // SetProperty - Set the Property on the passed node @@ -420,6 +422,10 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa { prop = melPropEvaluation; } + else if (EqualInsensitive(propName, "batchNormEvalMode")) + { + prop = melPropBatchNormMode; + } else if (EqualInsensitive(propName, "output")) { prop = melPropOutput; @@ -485,6 +491,32 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa // what to do here? break; } + case melPropBatchNormMode: + { + if (node->OperationName() != OperationNameOf(BatchNormalizationNode)) + { + RuntimeError("Invalid node type: node %ls (type:%ls) is not a %ls node; therefore cannot apply batchNormEvalMode on it.", + node->NodeName().c_str(), + node->OperationName().c_str(), + OperationNameOf(BatchNormalizationNode).c_str()); + } + bool property = params[2]; + auto pnode = dynamic_pointer_cast>(node); + if (pnode) + pnode->SetEvalMode(property); + else + { + auto pnode2 = dynamic_pointer_cast>(node); + if (pnode2) + pnode2->SetEvalMode(property); + else + { + RuntimeError("Invalid node type: node name=%ls. 
We assume either BatchNormalizationNode or BatchNormalizationNode\n", + node->NodeName().c_str()); + } + } + break; + } default: { RuntimeError("Invalid property, %s, is not supported", propName.c_str()); @@ -505,6 +537,10 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa { prop = melPropComputeGradient; } + else if (EqualInsensitive(propName, "batchNormEvalMode")) + { + prop = melPropBatchNormMode; + } else { RuntimeError("Invalid property, %s, is not supported", propName.c_str()); @@ -527,6 +563,12 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa netNdl->cn->SetLearnableNodesBelowNeedGradient(needGradient, node); break; } + case melPropBatchNormMode: + { + bool evalMode = params[2]; + netNdl->cn->SetBatchNormlizationNodesBelowEvalMode(evalMode, node); + break; + } default: { RuntimeError("Invalid property, %s, is not supported", propName.c_str()); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 26b78d8be05f..0a9b3bf8ac29 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -344,6 +344,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb void AddFeatureNode(ComputationNodeBasePtr featureNode); void RemoveFeatureNode(ComputationNodeBasePtr featureNode); void SetLearnableNodesBelowNeedGradient(const bool needGradient, const ComputationNodeBasePtr& rootNode = nullptr); + void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr); // ----------------------------------------------------------------------- // node access diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp index bdd1a063114d..6dddc73a3942 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp @@ -10,6 +10,7 @@ #include "ComputationNode.h" #include "ComputationNetwork.h" #include "InputAndParamNodes.h" +#include "ConvolutionalNodes.h" #include #include #include @@ -314,4 +315,41 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */) + { + vector nodes; + if (rootNode == nullptr) + { + for (auto pair : m_nameToNodeMap) + { + nodes.push_back(pair.second); + } + } + else + { + auto allnodes = rootNode->EnumerateNodes(true); + for (auto node : allnodes) + nodes.push_back(node); + } + + for (auto& node : nodes) + { + if (node->OperationName() == OperationNameOf(BatchNormalizationNode)) + { + auto pNode = dynamic_pointer_cast>(node); + if (!pNode) + { + auto pNode2 = dynamic_pointer_cast>(node); + if (!pNode2) + { + RuntimeError("Invalid node type: node name=%ls. 
We assume either BatchNormalizationNode or BatchNormalizationNode\n", node->NodeName().c_str()); + } + } + else + { + pNode->SetEvalMode(evalMode); + } + } + } + } }}} diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index 9c06b0795f23..3d2a7a34383d 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -811,6 +811,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } + void SetEvalMode(bool bnEvalMode) + { + m_eval = bnEvalMode; + } + private: struct VersionInfo { From 7b0159a41daa845f61a9314eebe01be585551d2c Mon Sep 17 00:00:00 2001 From: Alexey Kamenev Date: Tue, 12 Jan 2016 14:53:46 -0800 Subject: [PATCH 39/49] Added Python conversion script, updated readme.txt. --- .../Miscellaneous/CIFAR-10/CifarConverter.py | 64 +++++++++++++++++++ .../Image/Miscellaneous/CIFAR-10/readme.txt | 5 ++ 2 files changed, 69 insertions(+) create mode 100644 Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py diff --git a/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py b/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py new file mode 100644 index 000000000000..b1be6d15b6d4 --- /dev/null +++ b/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py @@ -0,0 +1,64 @@ +import os +import sys +import struct +import cPickle as cp +from PIL import Image +import numpy as np +import xml.etree.cElementTree as et +import xml.dom.minidom + +imgSize = 32 + +def saveImage(fname, data, label, mapFile, pad, **key_parms): + # data in CIFAR-10 dataset is in CHW format. + pixData = data.reshape((3, imgSize, imgSize)) + if ('mean' in key_parms): + key_parms['mean'] += pixData + + if pad > 0: + pixData = np.pad(pixData, ((0, 0), (pad, pad), (pad, pad)), mode = 'edge') + + img = Image.new('RGB', (imgSize + 2 * pad, imgSize + 2 * pad)) + pixels = img.load() + for x in range(img.size[0]): + for y in range(img.size[1]): + pixels[x, y] = (pixData[0][y][x], pixData[1][y][x], pixData[2][y][x]) + img.save(fname) + mapFile.write("%s\t%d\n" % (fname, label)) + +def saveMean(fname, data): + root = et.Element('opencv_storage') + et.SubElement(root, 'Channel').text = '3' + et.SubElement(root, 'Row').text = str(imgSize) + et.SubElement(root, 'Col').text = str(imgSize) + meanImg = et.SubElement(root, 'MeanImg', type_id='opencv-matrix') + et.SubElement(meanImg, 'rows').text = '1' + et.SubElement(meanImg, 'cols').text = str(imgSize * imgSize * 3) + et.SubElement(meanImg, 'dt').text = 'f' + et.SubElement(meanImg, 'data').text = ' '.join(['%e' % n for n in np.reshape(data, (imgSize * imgSize * 3))]) + + tree = et.ElementTree(root) + tree.write(fname) + x = xml.dom.minidom.parse(fname) + with open(fname, 'w') as f: + f.write(x.toprettyxml(indent = ' ')) + +if __name__ == "__main__": + rootDir = r'C:\Data\CIFAR-10' + '\\' + data = {} + dataMean = np.zeros((3, imgSize, imgSize)) # mean is in CHW format. 
+ with open(rootDir + 'train_map.txt', 'w') as mapFile:
+ for ifile in range(1, 6):
+ with open(r'C:\Data\CIFAR-10\Python\data_batch_' + str(ifile), 'rb') as f:
+ data = cp.load(f)
+ for i in range(10000):
+ fname = '%sdata\\train\\%05d.png' % (rootDir, i + (ifile - 1) * 10000)
+ saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, 4, mean=dataMean)
+ dataMean = dataMean / (50 * 1000)
+ saveMean('%sdata\\CIFAR-10_mean.xml' % rootDir, dataMean)
+ with open(rootDir + 'test_map.txt', 'w') as mapFile:
+ with open(r'C:\Data\CIFAR-10\Python\test_batch', 'rb') as f:
+ data = cp.load(f)
+ for i in range(10000):
+ fname = '%sdata\\test\\%05d.png' % (rootDir, i)
+ saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, 0)
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/readme.txt b/Examples/Image/Miscellaneous/CIFAR-10/readme.txt
index ea57413fcdfd..27bc2939e4c7 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/readme.txt
+++ b/Examples/Image/Miscellaneous/CIFAR-10/readme.txt
@@ -19,5 +19,10 @@ The network produces 21% of error after training for about 3 minutes on GPU.
To run the sample, navigate to this folder and run the following command:
configFile=01_Conv.config configName=01_Conv

+02_BatchNormConv.ndl is a convolutional network which uses the batch normalization technique (http://arxiv.org/abs/1502.03167).
+
+03_ResNet.ndl and 04_ResNet_56.ndl are very deep convolutional networks that use the ResNet architecture and have 20 and 56 layers respectively (http://arxiv.org/abs/1512.03385).
+With 03_ResNet.config you should get an error rate of around 10%.
+
For more details, refer to .ndl and corresponding .config files.

From 914ac61c96e6c1b5f6aea15a2b876b2310ab6597 Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Tue, 12 Jan 2016 15:27:32 -0800
Subject: [PATCH 40/49] Fix an inconsistency after merge with master.

---
Source/ComputationNetworkLib/CompositeComputationNodes.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h
index 26223101317a..8400407b79d6 100644
--- a/Source/ComputationNetworkLib/CompositeComputationNodes.h
+++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h
@@ -296,7 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CreateMatrixIfNull(m_value);
m_value->SetValue(value);
m_hasComputed = true;
- SetDims(value.GetNumRows(), value.GetNumCols());
+ SetDims(TensorShape(value.GetNumRows()), value.GetNumCols());
}
public:
bool m_hasComputed;

From 5bb9fbf6e6b1a9b2115c76fd45ec614550c2a9bc Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Tue, 12 Jan 2016 15:50:02 -0800
Subject: [PATCH 41/49] Fix a bug pointed out by Alexey. Thanks!

---
Source/CNTK/ModelEditLanguage.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp
index 94a1dc185fb7..844fd2c1bd5a 100644
--- a/Source/CNTK/ModelEditLanguage.cpp
+++ b/Source/CNTK/ModelEditLanguage.cpp
@@ -538,7 +538,7 @@ void MELScript::CallFunction(const std::string& p_name, const ConfigPa
{
prop = melPropComputeGradient;
}
- if (EqualInsensitive(propName, "batchNormEvalMode"))
+ else if (EqualInsensitive(propName, "batchNormEvalMode"))
{
prop = melPropBatchNormMode;
}

From c45401fbd095f39e90165d4185ce812ff2d1f5ec Mon Sep 17 00:00:00 2001
From: Yongqiang Wang
Date: Tue, 12 Jan 2016 17:08:46 -0800
Subject: [PATCH 42/49] Add Nesterov's momentum.
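The smoothed gradient v is updated as v_t = momentum * v_{t-1} + (1 - momentum) * learnRatePerSample * g_t, and the weights then take the Nesterov-style step w_t = w_{t-1} - momentum * v_t - (1 - momentum) * learnRatePerSample * g_t. A toy sketch of that update rule on a 1-D quadratic (illustration only; the function and names are ours, not CNTK code):

# Nesterov-style step as in Matrix::NormalGrad below:
#   v <- m*v + (1-m)*lr*g;  w <- w - m*v - (1-m)*lr*g
def nag_step(w, v, g, lr=0.1, m=0.9):
    v = m * v + (1.0 - m) * lr * g
    w = w - m * v - (1.0 - m) * lr * g
    return w, v

w, v = 5.0, 0.0
for _ in range(100):
    g = 2.0 * w                  # gradient of f(w) = w^2
    w, v = nag_step(w, v, g)
print(w)                         # approaches the minimum at 0

The loop converges to the minimum, matching the behavior of the dense branches in the diff below.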
To use NAG, simply add useNAG=true.
---
Source/Math/Matrix.cpp | 51 ++++++++++++++++++++++++++++++--
Source/Math/Matrix.h | 2 +-
Source/SGDLib/MultiNetworksSGD.h | 2 +-
Source/SGDLib/SGD.cpp | 18 +++++++----
Source/SGDLib/SGD.h | 8 +++--
5 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp
index 3650db859f33..52a9375796eb 100644
--- a/Source/Math/Matrix.cpp
+++ b/Source/Math/Matrix.cpp
@@ -1383,17 +1383,62 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template
- void Matrix::NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum)
+ void Matrix::NormalGrad(Matrix& gradients,
+ Matrix& functionValues,
+ const ElemType learnRatePerSample,
+ const ElemType momentum,
+ const bool useNesterovMomentum
+ )
{
DecideAndMoveToRightDevice(*this, gradients, functionValues);
-
- DISPATCH_MATRIX_ON_FLAG(&gradients,
+
+ if (!useNesterovMomentum)
+ {
+ DISPATCH_MATRIX_ON_FLAG(&gradients,
nullptr,
ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this,
ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this,
if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues),
if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues)
);
+ }
+ else
+ {
+ DISPATCH_MATRIX_ON_FLAG(&gradients,
+ nullptr,
+ {/* CPU dense */
+ ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
+ ScaleAndAdd(-momentum, *this, functionValues);
+ ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues);
+ // w_t = w_{t-1} - momentum * v_t - (1 - momentum) * learnRatePerSample * gradient
+ },
+ {/* GPU dense */
+ ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
+ ScaleAndAdd(-momentum, *this, functionValues);
+ ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues);
+ },
+ { /* CPU sparse */
+ if (momentum != 0)
+ {
+ Matrix gradientCache(gradients.GetDeviceId());
+ gradientCache.SetValue(gradients);
+ gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
+ ScaleAndAdd(-momentum, *this, functionValues);
+ ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues);
+ }
+ },
+ { /* GPU sparse */
+ if (momentum != 0)
+ {
+ Matrix gradientCache(gradients.GetDeviceId());
+ gradientCache.SetValue(gradients);
+ gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
+ ScaleAndAdd(-momentum, *this, functionValues);
+ ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues);
+ }
+ }
+ );
+ }
}

//both this and gradients will be changed
diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h
index 3791695299ff..94eb0dd53642 100644
--- a/Source/Math/Matrix.h
+++ b/Source/Math/Matrix.h
@@ -164,7 +164,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ShiftBy(int numShift);

// TODO: all these scalars should be passed as doubles and cast down inside
- void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
+ void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum, const bool useNAG);
ElemType Adagrad(Matrix& gradients, const bool needAveMultiplier);
void FSAdagrad(size_t mbSize,
Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h index 19f3f202526b..48d54f9af507 100644 --- a/Source/SGDLib/MultiNetworksSGD.h +++ b/Source/SGDLib/MultiNetworksSGD.h @@ -930,7 +930,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { Matrix& smoothedGradient = (*smoothedGradientIter); - UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier); + UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier, m_useNesterovMomentum); } } } diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index bab3d7a896d1..854f3d8551a3 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -1001,7 +1001,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, net->GetMBLayoutPtr()->GetNumParallelSequences()), aggregateNumSamples, m_L2RegWeight, m_L1RegWeight, - m_needAveMultiplier); + m_needAveMultiplier, m_useNesterovMomentum); #ifdef _DEBUG if (dynamic_pointer_cast>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): ")) LogicError("%ls %ls operation has NaNs in functionValues after parameter update.", node->NodeName().c_str(), node->OperationName().c_str()); @@ -2022,7 +2022,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) + const bool needAveMultiplier, + const bool useNesterovMomentum + ) { // we use simple linear (instead of log linear) scaling here const double momentum = MomentumPerMB(momentumPerSample, actualMBSize); @@ -2063,7 +2065,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (adpType == GradientsUpdateType::None) { smoothedGradient.NormalGrad(gradientValues, functionValues, - (ElemType)learnRatePerSample, (ElemType)momentum); + (ElemType)learnRatePerSample, (ElemType)momentum, useNesterovMomentum); } else if (adpType == GradientsUpdateType::AdaGrad || (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) || @@ -2113,7 +2115,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { const double momentumPerSample, const size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) const + const bool needAveMultiplier, + const bool useNesterovMomentum + ) const { #if DUMPOUTPUT fprintf(stderr, "Update_%ls\n", node->NodeName().c_str()); @@ -2124,7 +2128,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { UpdateWeightsS(this, dynamic_pointer_cast>(node)->Value(), dynamic_pointer_cast>(node)->Gradient(), smoothedGradient, learnRatePerSample, momentumPerSample, actualMBSize, L2RegWeight, L1RegWeight, - needAveMultiplier); + needAveMultiplier, m_useNesterovMomentum); node->BumpEvalTimeStamp(); } @@ -2514,6 +2518,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { floatargvector momentumPerMB = 
configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector())); floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector())); floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector())); + bool useNesterovMomentum = configSGD(L"useNAG", false); + m_maxTempMemSizeInSamplesForCNN = configSGD(L"maxTempMemSizeInSamplesForCNN", (size_t)0); @@ -2633,6 +2639,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_momentumParam = floatargvector(L"0.9"); m_momentumSpecifiedForMBSize = m_mbSize; } + m_useNesterovMomentum = useNesterovMomentum; + for (int i = 0; i < m_momentumParam.size(); i++) { if (m_momentumParam[i] >= 1.0 || m_momentumParam[i] < 0.0) diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index b99608b500f8..453f941c5bda 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -111,6 +111,7 @@ struct SGDParams : public ScriptableObjects::Object intargvector m_learningRatesSpecifiedForMBSize; // 1 for per sample, m_mbSize[] for per MB floatargvector m_momentumParam; intargvector m_momentumSpecifiedForMBSize; + bool m_useNesterovMomentum; // Determine the MB size used for mapping a given learning-rate or momentum parameter to a per-sample value. // MB size is the number of samples across all time steps and parallel sequences. @@ -440,7 +441,9 @@ class SGD : public SGDParams size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier); + const bool needAveMultiplier, + const bool useNesterovMomentum + ); protected: // UpdateWeights - update the weights in @@ -450,7 +453,8 @@ class SGD : public SGDParams const double momentumPerSample, const size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) const; + const bool needAveMultiplier, + const bool useNesterovMomentum) const; void ClipGradient(Matrix& gradient, const size_t actualMBSize) const; From f0655f04d9ab0f6f828724b25bcac307a16c3098 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 12 Jan 2016 18:12:11 -0800 Subject: [PATCH 43/49] (make gcc happy) --- Source/SGDLib/MultiNetworksSGD.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h index 48d54f9af507..01ba28c098fe 100644 --- a/Source/SGDLib/MultiNetworksSGD.h +++ b/Source/SGDLib/MultiNetworksSGD.h @@ -63,6 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { using SGDBase::m_L2RegWeight; using SGDBase::m_L1RegWeight; using SGDBase::m_needAveMultiplier; + using SGDBase::m_useNesterovMomentum; using SGDBase::m_traceLevel; using SGDBase::m_numMBsToShowResult; using SGDBase::m_gradientCheckSigDigit; From 270726e324f774ce2339474a6cbe6726050f999d Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Tue, 12 Jan 2016 23:58:28 -0800 Subject: [PATCH 44/49] Make SequenceGammar calculation parameters configurable. 
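These knobs follow the usual lattice-rescoring conventions: amf is the acoustic model scale divisor, lmf the language model scale, wp a per-word penalty, bMMIfactor the boosting factor for boosted MMI, and sMBRmode switches the criterion to state-level MBR. As a rough sketch of how such scales typically combine on a lattice arc (the formula and names below are our assumption for illustration, not the actual lattice code):

# Hypothetical per-arc score combination under (amf, lmf, wp); illustrative only.
def arc_score(am_loglik, lm_logprob, amf=14.0, lmf=14.0, wp=0.0):
    # acoustic score scaled down by amf, LM score scaled up by lmf, plus word penalty
    return am_loglik / amf + lmf * lm_logprob + wp

print(arc_score(am_loglik=-120.0, lm_logprob=-3.2))   # one toy lattice arc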
--- .../ComputationNetwork.cpp | 19 ++++++++-- .../ComputationNetwork.h | 14 +++++++- .../TrainingCriterionNodes.h | 16 +++++++++ Source/SGDLib/SGD.cpp | 8 ++++- Source/SGDLib/SGD.h | 5 +++ Source/SequenceTrainingLib/gammacalculation.h | 36 ++++++++++++++++++- 6 files changed, 92 insertions(+), 6 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 8da7ba6c71fa..8e1019c25722 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -622,7 +622,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { //set sequence training parameters, e.g. smoothing weight, frame drop threshhold template - void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign) + void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, + const ComputationNodeBasePtr criterionNode, + const double& hsmoothingWeight, + const double& frameDropThresh, + const bool& doreferencealign, + const double& amf /*= 14.0f*/, + const double& lmf /*= 14.0f*/, + const double& wp /*= 0.0f*/, + const double& bMMIfactor /*= 0.0f*/, + const bool& sMBR /*= false*/ + ) { fprintf(stderr, "Setting Hsmoothing weight to %.8g and frame-dropping threshhold to %.8g\n", hsmoothingWeight, frameDropThresh); list seqNodes = net->GetNodesWithType(OperationNameOf(SequenceWithSoftmaxNode), criterionNode); @@ -638,6 +648,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { node->SetSmoothWeight(hsmoothingWeight); node->SetFrameDropThresh(frameDropThresh); node->SetReferenceAlign(doreferencealign); + node->SetGammarCalculationParam(amf, lmf, wp, bMMIfactor, sMBR); } } } @@ -1118,14 +1129,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create); template void ComputationNetwork::PerformSVDecomposition(const map& SVDConfig, size_t alignedsize); template /*static*/void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); - template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign, + const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR); template void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly); template void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create); template void ComputationNetwork::PerformSVDecomposition(const map& SVDConfig, size_t alignedsize); template /*static*/void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & 
prevDropoutRate, unsigned long & dropOutSeed); - template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign, + const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR); // register ComputationNetwork with the ScriptableObject system ScriptableObjects::ConfigurableRuntimeTypeRegister::Add registerComputationNetwork(L"ComputationNetwork"); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 6d41a0b89569..3921d7434094 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -412,8 +412,20 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb template static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); + + + template - static void SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + static void SetSeqParam(ComputationNetworkPtr net, + const ComputationNodeBasePtr criterionNode, + const double& hsmoothingWeight, + const double& frameDropThresh, + const bool& doreferencealign, + const double& amf=14.0f, + const double& lmf=14.0f, + const double& wp=0.0f, + const double& bMMIfactor=0.0f, + const bool& sMBR=false); static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples); // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h index 1722f60aa938..a0f00586c3cf 100644 --- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h +++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h @@ -1418,6 +1418,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_doReferenceAlignment = doreferencealign; } + void SetGammarCalculationParam(const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR) + { + msra::lattices::SeqGammarCalParam param; + param.amf = amf; + param.lmf = lmf; + param.wp = wp; + param.bMMIfactor = bMMIfactor; + param.sMBRmode = sMBR; + m_gammaCalculator.SetGammarCalculationParams(param); + } + void gettime(unsigned long long &gammatime, unsigned long long &partialtime) { gammatime = m_gammatime; @@ -1430,6 +1441,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { shared_ptr> m_gammaFromLattice; double m_frameDropThreshold; double m_fsSmoothingWeight; // frame-sequence criterion interpolation weight --TODO: can this be done outside? 
+ double m_seqGammarAMF; + double m_seqGammarLMF; + double m_seqGammarWP; + double m_seqGammarbMMIFactor; + double m_seqGammarUsesMBR; bool m_doReferenceAlignment; std::vector> m_lattices; msra::asr::simplesenonehmm m_hmm; diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 854f3d8551a3..11290caa33cb 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -303,7 +303,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // likewise for sequence training parameters if (isSequenceTrainingCriterion) { - ComputationNetwork::SetSeqParam(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign); + ComputationNetwork::SetSeqParam(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign, + m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR ); } // --- MAIN EPOCH LOOP @@ -2534,6 +2535,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_hSmoothingWeight = configSGD(L"hSmoothingWeight", 0.95); m_frameDropThresh = configSGD(L"frameDropThresh", 1e-10); m_doReferenceAlign = configSGD(L"doReferenceAlign", false); + m_seqGammarCalcUsesMBR = configSGD(L"seqGammarUsesMBR", false); + m_seqGammarCalcAMF = configSGD(L"seqGammarAMF", 14.0); + m_seqGammarCalcLMF = configSGD(L"seqGammarLMF", 14.0); + m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0); + m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0); m_dropoutRates = configSGD(L"dropoutRate", ConfigRecordType::Array(floatargvector(vector{ 0.0f }))); diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 453f941c5bda..a014ec1d2b13 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -250,6 +250,11 @@ struct SGDParams : public ScriptableObjects::Object double m_hSmoothingWeight; double m_frameDropThresh; bool m_doReferenceAlign; + double m_seqGammarCalcAMF; + double m_seqGammarCalcLMF; + double m_seqGammarCalcWP; + double m_seqGammarCalcbMMIFactor; + bool m_seqGammarCalcUsesMBR; }; template class IDistGradAggregator; diff --git a/Source/SequenceTrainingLib/gammacalculation.h b/Source/SequenceTrainingLib/gammacalculation.h index f8a60e5bd478..f63c5048793c 100644 --- a/Source/SequenceTrainingLib/gammacalculation.h +++ b/Source/SequenceTrainingLib/gammacalculation.h @@ -11,6 +11,23 @@ #pragma warning (disable: 4127) // conditional expression is constant namespace msra { namespace lattices { + + struct SeqGammarCalParam{ + double amf; + double lmf; + double wp; + double bMMIfactor; + bool sMBRmode; + SeqGammarCalParam() + { + amf = 14.0; + lmf = 14.0; + wp = 0.0; + bMMIfactor = 0.0; + sMBRmode = false; + } + }; + template class GammaCalculation { @@ -30,6 +47,9 @@ namespace msra { namespace lattices { } + //======================================== + // Sec. 1 init functions + //======================================== void init(msra::asr::simplesenonehmm hset, int DeviceId) { m_deviceid = DeviceId; @@ -47,7 +67,21 @@ namespace msra { namespace lattices { } } - + //======================================== + // Sec. 2 set functions + //======================================== + void SetGammarCalculationParams(const SeqGammarCalParam& gammarParam) + { + lmf = (float)gammarParam.lmf; + amf = (float)gammarParam.amf; + wp = (float)gammarParam.wp; + seqsMBRmode = gammarParam.sMBRmode; + boostmmifactor = (float)gammarParam.bMMIfactor; + } + + //======================================== + // Sec. 
3 calculation functions + //======================================== void calgammaformb( Microsoft::MSR::CNTK::Matrix& functionValues, std::vector> &lattices, const Microsoft::MSR::CNTK::Matrix& loglikelihood, From ba61abd79ec85b0177a2c665a2e814adb059d939 Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Fri, 8 Jan 2016 16:41:45 -0800 Subject: [PATCH 45/49] Disable popups on Windows --- Makefile | 2 +- Source/CNTK/CNTK.cpp | 2 ++ Source/EvalDll/EvalDll.vcxproj | 4 ++-- Source/Math/Math.vcxproj | 4 ++-- Source/Math/MathCUDA.vcxproj | 2 +- Source/Readers/BinaryReader/BinaryReader.vcxproj | 4 ++-- Source/Readers/DSSMReader/DSSMReader.vcxproj | 4 ++-- Source/Readers/DataReaderTest/DataReaderTest.vcxproj | 8 ++++---- Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj | 4 ++-- Source/Readers/ImageReader/ImageReader.vcxproj | 2 +- Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj | 4 ++-- Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj | 4 ++-- .../Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj | 4 ++-- Source/Readers/SparsePCReader/SparsePCReader.vcxproj | 4 ++-- Source/Readers/UCIFastReader/UCIFastReader.vcxproj | 4 ++-- Source/Readers/UCIReader/UCIReader.vcxproj | 8 ++++---- Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj | 2 +- 17 files changed, 34 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index 6c95042491d6..c2c474bffada 100644 --- a/Makefile +++ b/Makefile @@ -162,7 +162,7 @@ ifeq ("$(BUILDTYPE)","debug") CXXFLAGS += -g LDFLAGS += -rdynamic CPPFLAGS += -D_DEBUG - CUFLAGS += -O0 -use_fast_math -lineinfo $(GENCODE_FLAGS) + CUFLAGS += -O0 -g -use_fast_math -lineinfo $(GENCODE_FLAGS) endif ifeq ("$(BUILDTYPE)","release") diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index e7753e2c70b1..94c91db877fe 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -684,6 +684,8 @@ void terminate_this() { fprintf(stderr, "terminate_this: aborting\n"), fflush(st int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions { set_terminate (terminate_this); // insert a termination handler to ensure stderr gets flushed before actually terminating + _set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing + // Note: this does not seem to work--processes with this seem to just hang instead of terminating __try { diff --git a/Source/EvalDll/EvalDll.vcxproj b/Source/EvalDll/EvalDll.vcxproj index 71e515bc8520..a535ca3ff1c2 100644 --- a/Source/EvalDll/EvalDll.vcxproj +++ b/Source/EvalDll/EvalDll.vcxproj @@ -74,7 +74,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib" @@ -102,7 +102,7 @@ Speed - Windows + Console true true true diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj index 950fab3417f8..5a101393d128 100644 --- a/Source/Math/Math.vcxproj +++ b/Source/Math/Math.vcxproj @@ -79,7 +79,7 @@ true - Windows + Console true libacml_mp_dll.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\ @@ -127,7 +127,7 @@ MultiThreadedDLL - Windows + Console true true true diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj index 9a1a5a6bec08..201b770a5687 100644 --- a/Source/Math/MathCUDA.vcxproj +++ b/Source/Math/MathCUDA.vcxproj @@ -91,7 +91,7 @@ true - Windows + Console true 
cudart.lib;cublas.lib;cusparse.lib;curand.lib;libacml_mp_dll.lib;%(AdditionalDependencies) true diff --git a/Source/Readers/BinaryReader/BinaryReader.vcxproj b/Source/Readers/BinaryReader/BinaryReader.vcxproj index 208fab6bc435..ac0f40baceec 100644 --- a/Source/Readers/BinaryReader/BinaryReader.vcxproj +++ b/Source/Readers/BinaryReader/BinaryReader.vcxproj @@ -70,7 +70,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -91,7 +91,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/DSSMReader/DSSMReader.vcxproj b/Source/Readers/DSSMReader/DSSMReader.vcxproj index 1412fac38f20..d607a7c9fc38 100644 --- a/Source/Readers/DSSMReader/DSSMReader.vcxproj +++ b/Source/Readers/DSSMReader/DSSMReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/DataReaderTest/DataReaderTest.vcxproj b/Source/Readers/DataReaderTest/DataReaderTest.vcxproj index 8a422f187006..438c7daede3a 100644 --- a/Source/Readers/DataReaderTest/DataReaderTest.vcxproj +++ b/Source/Readers/DataReaderTest/DataReaderTest.vcxproj @@ -100,7 +100,7 @@ true - Windows + Console true $(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories) @@ -115,7 +115,7 @@ true - Windows + Console true $(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories) ucireader.lib;Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -133,7 +133,7 @@ true - Windows + Console true true true @@ -152,7 +152,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj b/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj index de7772889858..fd8f9c343f67 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj @@ -69,7 +69,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -87,7 +87,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/ImageReader/ImageReader.vcxproj b/Source/Readers/ImageReader/ImageReader.vcxproj index b5061adafd76..7d3a3b01c636 100644 --- a/Source/Readers/ImageReader/ImageReader.vcxproj +++ b/Source/Readers/ImageReader/ImageReader.vcxproj @@ -75,7 +75,7 @@ true - Windows + Console true Math.lib;$(OpenCVLib);%(AdditionalDependencies) diff --git a/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj b/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj index 24a8a11122be..93b527173fed 100644 --- a/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj +++ b/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj @@ -71,7 +71,7 @@ true - Windows + Console true 
Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -92,7 +92,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj b/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj index bb68dd89df85..a73d0af74088 100644 --- a/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj +++ b/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj @@ -71,7 +71,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -92,7 +92,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj b/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj index e3a10c534203..e5d8ac1fb2b9 100644 --- a/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj +++ b/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/SparsePCReader/SparsePCReader.vcxproj b/Source/Readers/SparsePCReader/SparsePCReader.vcxproj index 72d18defe990..db66c6d311d5 100644 --- a/Source/Readers/SparsePCReader/SparsePCReader.vcxproj +++ b/Source/Readers/SparsePCReader/SparsePCReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/UCIFastReader/UCIFastReader.vcxproj b/Source/Readers/UCIFastReader/UCIFastReader.vcxproj index fc0e03ffa999..e30dc6b90299 100644 --- a/Source/Readers/UCIFastReader/UCIFastReader.vcxproj +++ b/Source/Readers/UCIFastReader/UCIFastReader.vcxproj @@ -70,7 +70,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -91,7 +91,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/UCIReader/UCIReader.vcxproj b/Source/Readers/UCIReader/UCIReader.vcxproj index 2e25c2b5779f..08cce8205b9b 100644 --- a/Source/Readers/UCIReader/UCIReader.vcxproj +++ b/Source/Readers/UCIReader/UCIReader.vcxproj @@ -91,7 +91,7 @@ true - Windows + Console true 
Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -107,7 +107,7 @@ ..\..\common\include;..\..\Source\Math - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -124,7 +124,7 @@ true - Windows + Console true true true @@ -144,7 +144,7 @@ ..\..\common\include;..\..\Source\Math - Windows + Console true true true diff --git a/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj b/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj index c7c9d407325b..b379735411ff 100644 --- a/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj +++ b/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj @@ -44,7 +44,7 @@ true - Windows + Console true From 19a9895d1a1c49378feb3986a4ee1fcf7b98b9e5 Mon Sep 17 00:00:00 2001 From: Yongqiang Wang Date: Wed, 13 Jan 2016 12:25:20 -0800 Subject: [PATCH 46/49] Print SeqGammar-related parameters for better logging. --- Source/ComputationNetworkLib/ComputationNetwork.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 8e1019c25722..8f1bf6fae73e 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -635,6 +635,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { ) { fprintf(stderr, "Setting Hsmoothing weight to %.8g and frame-dropping threshold to %.8g\n", hsmoothingWeight, frameDropThresh); + fprintf(stderr, "Setting SeqGammar-related parameters: amf=%.2f, lmf=%.2f, wp=%.2f, bMMIFactor=%.2f, usesMBR=%s\n", + amf, lmf, wp, bMMIfactor, sMBR ? "true" : "false"); list seqNodes = net->GetNodesWithType(OperationNameOf(SequenceWithSoftmaxNode), criterionNode); if (seqNodes.size() == 0) { From d39d87f03bd1a6d44c7ec6aae863bc09643eb9d7 Mon Sep 17 00:00:00 2001 From: thhoens Date: Sat, 9 Jan 2016 04:24:23 -0800 Subject: [PATCH 47/49] Fixed a bug where m_elemSizeAllocated was used instead of m_nz --- Source/Math/GPUSparseMatrix.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index 63234dabe9b3..3f125330a8d8 100644 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -87,9 +87,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0)); } + // TODO: Comment these methods more thoroughly, e.g., why it uses numNZ instead of m_elemSizeAllocated.
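+ // Rationale for the fix below: m_nz is the number of non-zero elements actually stored,
+ // whereas m_elemSizeAllocated is the allocated buffer capacity, which can exceed m_nz after reallocation;
+ // sizing the index count from the capacity overstates the range of valid index data, hence the switch to m_nz.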
size_t MajorIndexCount() const { - return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format); + return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format); } size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const { From f835efd05234fa5ed088dec1ab1744e22a9402cf Mon Sep 17 00:00:00 2001 From: thhoens Date: Tue, 12 Jan 2016 16:01:29 -0800 Subject: [PATCH 48/49] Fix for multi-GPU training: share all parameters required to adjust the learning rate. --- Source/SGDLib/SGD.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 11290caa33cb..039721a108c5 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -513,6 +513,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1)) { g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank()); + g_mpi->Bcast(&lrControlCriterion, 1, g_mpi->MainNodeRank()); } bool loadedPrevModel = false; From de5be29239fc7553fe050cd02413e8045b0dbad7 Mon Sep 17 00:00:00 2001 From: thhoens Date: Tue, 12 Jan 2016 16:23:04 -0800 Subject: [PATCH 49/49] Minor performance upgrade on column slicing to avoid a GPU memory copy. --- Source/Math/GPUSparseMatrix.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index 7e4f7a1c6fb0..3d4635020a88 100644 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -2246,7 +2246,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { slice.m_computeDevice = m_computeDevice; slice.m_numRows = m_numRows; slice.m_numCols = numCols; - slice.m_nz = SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn); + slice.m_nz = (numCols == m_numCols) ? m_nz : SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn); slice.m_elemSizeAllocated = m_elemSizeAllocated; slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated; slice.m_pArray = m_pArray;
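+ // Note on the m_nz assignment above: when the slice spans all columns, m_nz can be reused as-is;
+ // deriving it via SecondaryIndexValueAt() reads the secondary index back from GPU memory, which is the copy this change avoids.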