
Commit

Apply code review feedback
Wayne-Xiong committed Mar 3, 2016
1 parent 9c4ed05 commit b2b6691
Showing 8 changed files with 49 additions and 49 deletions.
10 changes: 6 additions & 4 deletions Source/ActionsLib/EvalActions.cpp
@@ -59,7 +59,8 @@ static void DoEvalBase(const ConfigParameters& config, IDataReader& reader)
size_t numMBsToShowResult = config(L"numMBsToShowResult", "100");
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
bool paralleltrain = config(L"parallelTrain", false);
//TODO: switch to a global parallel setting for both training and evaluation.
bool useParallel = config(L"parallelTrain", false);

ConfigArray evalNodeNames = config(L"evalNodeNames", "");
vector<wstring> evalNodeNamesVector;
@@ -70,7 +71,7 @@ static void DoEvalBase(const ConfigParameters& config, IDataReader& reader)

auto net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, modelPath);

SimpleEvaluator<ElemType> eval(net, numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches, paralleltrain);
SimpleEvaluator<ElemType> eval(net, numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches, useParallel);
eval.Evaluate(&reader, evalNodeNamesVector, mbSize[0], epochSize);
}

@@ -119,7 +120,8 @@ void DoCrossValidate(const ConfigParameters& config)
size_t numMBsToShowResult = config(L"numMBsToShowResult", "100");
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
bool paralleltrain = config(L"parallelTrain", false);
//TODO: switch to a global parallel setting for both training and evaluation.
bool useParallel = config(L"parallelTrain", false);

ConfigArray evalNodeNames = config(L"evalNodeNames", "");
vector<wstring> evalNodeNamesVector;
@@ -153,7 +155,7 @@ void DoCrossValidate(const ConfigParameters& config)
cvModels.push_back(cvModelPath);
auto net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, cvModelPath);

SimpleEvaluator<ElemType> eval(net, numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches, paralleltrain);
SimpleEvaluator<ElemType> eval(net, numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches, useParallel);

fprintf(stderr, "model %ls --> \n", cvModelPath.c_str());
auto evalErrors = eval.Evaluate(&cvDataReader, evalNodeNamesVector, mbSize[0], epochSize);
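
The TODO in both functions points at replacing the per-action parallelTrain reads with one setting shared by training and evaluation. A minimal sketch of the idea, using a generic map-based config stand-in; the helper names and the lookup code below are hypothetical and not part of this commit:

    #include <map>
    #include <string>

    // Hypothetical stand-in for a parsed config section; CNTK's ConfigParameters
    // lookup with a default (config(L"parallelTrain", false)) plays the same role above.
    using Config = std::map<std::wstring, std::wstring>;

    static bool ReadBool(const Config& config, const std::wstring& key, bool defaultValue)
    {
        auto it = config.find(key);
        return it == config.end() ? defaultValue : (it->second == L"true");
    }

    // Read the parallelism switch once; both DoEvalBase/DoCrossValidate and the trainer
    // would receive this value instead of re-reading "parallelTrain" themselves.
    static bool ReadGlobalParallelSetting(const Config& config)
    {
        return ReadBool(config, L"parallelTrain", false);
    }
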
2 changes: 1 addition & 1 deletion Source/CNTK/ModelEditLanguage.cpp
@@ -596,7 +596,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
case melPropBatchNormMode:
{
bool evalMode = params[2];
netNdl->cn->SetBatchNormlizationNodesBelowEvalMode(evalMode, node);
netNdl->cn->SetBatchNormalizationNodesBelowEvalMode(evalMode, node);
break;
}
default:
2 changes: 1 addition & 1 deletion Source/ComputationNetworkLib/ComputationNetwork.h
@@ -328,7 +328,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb
void AddFeatureNode(ComputationNodeBasePtr featureNode);
void RemoveFeatureNode(ComputationNodeBasePtr featureNode);
void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr);
void SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);
void SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);

// -----------------------------------------------------------------------
// node access
2 changes: 1 addition & 1 deletion Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
@@ -323,7 +323,7 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
}
}

void ComputationNetwork::SetBatchNormlizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
void ComputationNetwork::SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
{
vector<ComputationNodeBasePtr> nodes;
if (rootNode == nullptr)
14 changes: 9 additions & 5 deletions Source/SGDLib/DataReaderHelpers.h
@@ -186,23 +186,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}

template<class ElemType>
static size_t GetNumSubminibatchNeeded(IDataReader<ElemType>* dataReader,
static size_t GetNumSubminibatchesNeeded(IDataReader<ElemType>* dataReader,
size_t maxSamplesInRAM,
size_t numSubminiBatches,
size_t numSubminibatches,
size_t tunedMBSize)
{
if (numSubminiBatches > 1)
return numSubminiBatches;
if (numSubminibatches > 1) // the user explicitly specified the number of subminibatches
return numSubminibatches;

if (maxSamplesInRAM < SIZE_MAX)
{
// into how many pieces would we need to break the minibatch?
// TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed.
size_t numParallelSequences = dataReader->GetNumParallelSequences();
size_t estimatedMBSize = tunedMBSize * numParallelSequences;
return (size_t)std::ceil((float)estimatedMBSize / maxSamplesInRAM);
return (estimatedMBSize + maxSamplesInRAM - 1) / maxSamplesInRAM;
}

// The return value of this method decides how many subminibatches are needed for the training or
// eval process. The caller only starts the subminibatch loop when the calculated number is larger
// than 1, so returning 0 or 1 here yields the same behavior. The default should still be 0,
// which means no subminibatches are needed in this case.
return 0;
}

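
For context (not part of the commit): the new return statement replaces the float-based std::ceil with exact integer ceiling division, and the 0/1 return values are interchangeable because the caller only dispatches subminibatches when the count is greater than 1. A minimal standalone sketch of the arithmetic, with hypothetical names:

    #include <cassert>
    #include <cstddef>

    // Integer ceiling division: number of chunks of size `chunk` needed to cover `total`.
    // Equivalent to ceil(total / (double)chunk) but exact, assuming total + chunk - 1 does not overflow.
    static size_t CeilDiv(size_t total, size_t chunk)
    {
        return (total + chunk - 1) / chunk;
    }

    int main()
    {
        assert(CeilDiv(1000, 250) == 4); // exact fit
        assert(CeilDiv(1001, 250) == 5); // the remainder needs one extra chunk
        assert(CeilDiv(100, 250) == 1);  // one chunk is enough; the caller treats 0 and 1 identically
        return 0;
    }
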
6 changes: 3 additions & 3 deletions Source/SGDLib/SGD.cpp
@@ -335,7 +335,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,

// set dropout rate for this epoch
ComputationNetwork::SetDropoutRate<ElemType>(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
net->SetBatchNormlizationNodesBelowEvalMode(false, criterionNodes[0]);
net->SetBatchNormalizationNodesBelowEvalMode(false, criterionNodes[0]);

// learning rate adjustment
if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || i < m_learningRatesParam.size())
@@ -438,7 +438,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
timer.Stop();
double epochTime = timer.ElapsedSeconds();

net->SetBatchNormlizationNodesBelowEvalMode(true, criterionNodes[0]);
net->SetBatchNormalizationNodesBelowEvalMode(true, criterionNodes[0]);

if (m_useEvalCriterionControlLR && epochEvalErrors.size() > 0)
{
@@ -781,7 +781,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// prepare for sub-minibatching
// Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM.
DataReaderHelpers::SubminibatchDispatcher<ElemType> smbDispatcher;
size_t numSubminibatchesNeeded = DataReaderHelpers::GetNumSubminibatchNeeded(trainSetDataReader, m_maxSamplesInRAM, m_numSubminiBatches, tunedMBSize);
size_t numSubminibatchesNeeded = DataReaderHelpers::GetNumSubminibatchesNeeded(trainSetDataReader, m_maxSamplesInRAM, m_numSubminiBatches, tunedMBSize);
// this is non-trivial, we need a manager object to handle this
if (numSubminibatchesNeeded > 1)
smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);
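
Side note, not part of the commit: the epoch loop above switches batch-normalization nodes into training mode before each epoch and back into eval mode afterwards with two explicit calls. A minimal sketch of an exception-safe alternative, using a hypothetical RAII guard (illustrative only):

    #include <functional>
    #include <utility>

    // Hypothetical scope guard: puts batch-norm nodes into training mode on entry and
    // restores eval mode on exit, even if the training epoch throws.
    class ScopedBatchNormTrainingMode
    {
    public:
        explicit ScopedBatchNormTrainingMode(std::function<void(bool)> setEvalMode)
            : m_setEvalMode(std::move(setEvalMode))
        {
            m_setEvalMode(false); // enter training mode
        }
        ~ScopedBatchNormTrainingMode()
        {
            m_setEvalMode(true); // restore eval mode
        }

    private:
        std::function<void(bool)> m_setEvalMode;
    };

    // Usage sketch inside the epoch loop:
    // ScopedBatchNormTrainingMode bnGuard([&](bool evalMode) {
    //     net->SetBatchNormalizationNodesBelowEvalMode(evalMode, criterionNodes[0]);
    // });
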
1 change: 1 addition & 0 deletions Source/SGDLib/SimpleDistGradAggregator.h
@@ -5,6 +5,7 @@
#include <future>
#include "GPUDataTransferer.h"
#include "TimerUtility.h"
#include "MatrixQuantizerImpl.h"

namespace Microsoft { namespace MSR { namespace CNTK {

61 changes: 27 additions & 34 deletions Source/SGDLib/SimpleEvaluator.h
@@ -13,7 +13,6 @@
#include "ProgressTracing.h"
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "MatrixQuantizerImpl.h"
#include "SimpleDistGradAggregator.h"

#include <vector>
@@ -108,78 +107,72 @@ class SimpleEvaluator
for (int i = 0; i < evalResults.size(); i++)
evalResultsLastMBs.push_back((ElemType) 0);

//TODO: we should add support for distributed reading
dataReader->StartMinibatchLoop(mbSize, 0, testSize);
m_net->StartEvaluateMinibatchLoop(evalNodes);

std::vector<Matrix<ElemType>*> learnParamsGradients;
DataReaderHelpers::SubminibatchDispatcher<ElemType> smbDispatcher;
size_t numSubminibatchesNeeded = DataReaderHelpers::GetNumSubminibatchNeeded(dataReader, m_maxSamplesInRAM, m_numSubminiBatches, mbSize);
size_t numSubminibatchesNeeded = DataReaderHelpers::GetNumSubminibatchesNeeded(dataReader, m_maxSamplesInRAM, m_numSubminiBatches, mbSize);

// Passing in two empty node lists so the dispatcher can work for the evalNodes.
std::list<ComputationNodeBasePtr> learnableNodes;
std::vector<ComputationNodeBasePtr> criterionNodes;
if (numSubminibatchesNeeded > 1)
smbDispatcher.Init(m_net, learnableNodes, criterionNodes, evalNodes);

bool noMoreSamplesToProcess = false;
for (;;)
while (DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, nullptr, false, m_parallelRun, inputMatrices, actualMBSize))
{
bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, nullptr, false, m_parallelRun, inputMatrices, actualMBSize);
if (!wasDataRead && (!m_parallelRun || noMoreSamplesToProcess))
break;

if (!wasDataRead)
actualMBSize = 0;

if (actualMBSize > 0)
size_t actualNumSubminibatches = numSubminibatchesNeeded <= 1 ? 1 : smbDispatcher.GetMinibatchIntoCache(*dataReader, *m_net, inputMatrices, numSubminibatchesNeeded);
for (size_t ismb = 0; ismb < actualNumSubminibatches; ismb++)
{
size_t actualNumSubminibatches = numSubminibatchesNeeded <= 1 ? 1 : smbDispatcher.GetMinibatchIntoCache(*dataReader, *m_net, inputMatrices, numSubminibatchesNeeded);
for (size_t ismb = 0; ismb < actualNumSubminibatches; ismb++)
if (actualNumSubminibatches > 1)
{
if (actualNumSubminibatches > 1)
{
smbDispatcher.GetSubMinibatchToNet(ismb); // get sub-minibatch from full-size one
}

ComputationNetwork::BumpEvalTimeStamp(featureNodes);
ComputationNetwork::BumpEvalTimeStamp(labelNodes);
smbDispatcher.GetSubMinibatchToNet(ismb); // get sub-minibatch from full-size one
}

m_net->ForwardProp(evalNodes);
ComputationNetwork::BumpEvalTimeStamp(featureNodes);
ComputationNetwork::BumpEvalTimeStamp(labelNodes);

// house-keeping for sub-minibatching
if (actualNumSubminibatches > 1)
smbDispatcher.DoneWithCurrentSubMinibatch(ismb); // page state out
} // end sub-minibatch loop
m_net->ForwardProp(evalNodes);

// house-keeping for sub-minibatching
if (actualNumSubminibatches > 1)
smbDispatcher.DoneWithCurrentMinibatch();
} // if (actualMBSize > 0)
smbDispatcher.DoneWithCurrentSubMinibatch(ismb); // page state out
} // end sub-minibatch loop

if (actualNumSubminibatches > 1)
smbDispatcher.DoneWithCurrentMinibatch();

size_t numSamplesWithLabel = wasDataRead ? m_net->GetNumSamplesWithLabel(actualMBSize) : 0;
size_t numSamplesWithLabel = m_net->GetNumSamplesWithLabel(actualMBSize);
size_t aggregateNumSamplesWithLabel = numSamplesWithLabel;
if (m_parallelRun)
{
if (m_gradHeader == nullptr)
{
m_gradHeader = DistGradHeader::Create(evalNodes.size());
m_distGradAgg = new SimpleDistGradAggregator<ElemType>(g_mpi, false, m_traceLevel);
m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(g_mpi, false, m_traceLevel);
}

m_gradHeader->numEvalNode = evalNodes.size();
m_gradHeader->numSamples = actualMBSize;
m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
m_gradHeader->criterion = 0.0;
for (size_t i = 0; i < evalNodes.size(); i++)
m_gradHeader->evalErrors[i] = actualMBSize > 0 ? evalNodes[i]->Get00Element() : 0.0;
m_gradHeader->evalErrors[i] = evalNodes[i]->Get00Element();

// TODO: We are reusing the aggregation logic inside SimpleDistGradAggregator, which has a heavy dependency
// on the gradient matrix. At some point we should refactor the aggregator class so that it can compute
// eval results only, and then remove this hack.
if (learnParamsGradients.size() == 0)
{
Matrix<ElemType>* matrix = new Matrix<ElemType>((DEVICEID_TYPE)m_net->GetDeviceId());
learnParamsGradients.push_back(matrix);
}

bool samplesProcessed = m_distGradAgg->AggregateGradients(learnParamsGradients, m_gradHeader, 0);
noMoreSamplesToProcess = !samplesProcessed;
// Using SimpleDistGradAggregator for aggregating eval results only. At some point we should rename the classes
// to just IDistAggregator and SimpleDistAggregator.
m_distGradAgg->AggregateGradients(learnParamsGradients, m_gradHeader, 0);
aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel;
for (size_t i = 0; i < evalResults.size(); i++)
evalResults[i] += m_gradHeader->evalErrors[i];
@@ -294,7 +287,7 @@
size_t m_numSubminiBatches;
bool m_parallelRun;

IDistGradAggregator<ElemType>* m_distGradAgg;
shared_ptr<IDistGradAggregator<ElemType>> m_distGradAgg;
struct DistGradHeader* m_gradHeader;
int m_traceLevel;
void operator=(const SimpleEvaluator&); // (not assignable)
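
For context (not part of the commit): the aggregator member changes from a raw IDistGradAggregator<ElemType>* to a shared_ptr, and the matching new becomes make_shared, so the evaluator no longer needs to delete the aggregator manually. A minimal sketch of the ownership pattern with stand-in types (not the CNTK classes):

    #include <memory>

    struct IAggregator { virtual ~IAggregator() = default; };

    struct SimpleAggregator final : IAggregator
    {
        SimpleAggregator(bool /*useAsyncAggregation*/, int /*traceLevel*/) {}
    };

    class Evaluator
    {
    public:
        void EnsureAggregator()
        {
            // Lazily create the aggregator on first use, as SimpleEvaluator does;
            // shared_ptr now owns it, so no explicit delete is needed in the destructor.
            if (m_agg == nullptr)
                m_agg = std::make_shared<SimpleAggregator>(/*useAsyncAggregation=*/false, /*traceLevel=*/0);
        }

    private:
        std::shared_ptr<IAggregator> m_agg; // was: IAggregator* m_agg;
    };
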
