From 30f57f2cc843e44476f25617526c9f1c3791bc17 Mon Sep 17 00:00:00 2001 From: Chris Basoglu Date: Fri, 26 Jun 2015 09:52:36 -0700 Subject: [PATCH 1/6] Add dynamic minibatch sizing. --- Common/Include/DataReader.h | 23 +- Common/Include/commandArgUtil.h | 46 +- .../lyx/CNTKBook_CNTK_Chapter.lyx | 96 ++++ MachineLearning/CNTK/SGD.h | 489 ++++++++++++++---- 4 files changed, 526 insertions(+), 128 deletions(-) diff --git a/Common/Include/DataReader.h b/Common/Include/DataReader.h index 79909331a1db..709b49f374b8 100644 --- a/Common/Include/DataReader.h +++ b/Common/Include/DataReader.h @@ -29,9 +29,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { -const size_t randomizeAuto = ((size_t)-1)>>2; // randomize range set automatically, parameter value for Init() -const size_t randomizeNone = 0; // don't randomize, parameter value for Init() -const size_t requestDataSize = randomizeAuto; // StartMinibatchLoop default parameter, sets number of requested frames equal to the number of frames in the dataset +// randomize range set automatically, parameter value for Init() +const size_t randomizeAuto = ((size_t) -1) >> 2; + +// don't randomize, parameter value for Init() +const size_t randomizeNone = 0; + +// StartMinibatchLoop default parameter, sets number of requested +// frames equal to the constant 3fffffffffffffff computed by ((size_t) -1) >> 2 above. +// We use this constant as a stand in for the total number of frames in the dataset. +const size_t requestDataSize = randomizeAuto; enum EndDataType { @@ -52,7 +59,7 @@ class DATAREADER_API IDataReader virtual void Init(const ConfigParameters& config) = 0; virtual void Destroy() = 0; - virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize) = 0; + virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) = 0; virtual bool GetMinibatch(std::map*>& matrices) = 0; virtual size_t NumberSlicesInEachRecurrentIter() = 0; virtual void SetNbrSlicesEachRecurrentIter(const size_t) = 0; @@ -80,7 +87,7 @@ class DataReader : public IDataReader, protected Plugin typedef typename IDataReader::LabelType LabelType; typedef typename IDataReader::LabelIdType LabelIdType; private: - IDataReader *m_dataReader; // reader + IDataReader* m_dataReader; // reader // Init - Reader Initialize for multiple data sets // config - [in] configuration parameters for the datareader @@ -123,7 +130,7 @@ class DataReader : public IDataReader, protected Plugin // mbSize - [in] size of the minibatch (number of frames, etc.) // epoch - [in] epoch number for this loop // requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset - virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); + virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize); // GetMinibatch - Get the next minibatch (features and labels) // matrices - [in] a map with named matrix types (i.e. 
'features', 'labels') mapped to the corresponing matrix, @@ -152,10 +159,10 @@ class DataReader : public IDataReader, protected Plugin // [out] size of buffer filled with data // recordStart - record to start reading from, defaults to zero (start of data) // returns: true if data remains to be read, false if the end of data was reached - virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0); + virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart = 0); virtual bool DataEnd(EndDataType endDataType); - void SetSentenceEndInBatch(std::vector &sentenceEnd); + void SetSentenceEndInBatch(std::vector& sentenceEnd); }; }}} diff --git a/Common/Include/commandArgUtil.h b/Common/Include/commandArgUtil.h index 507a4cb1240c..c54a65fcd393 100644 --- a/Common/Include/commandArgUtil.h +++ b/Common/Include/commandArgUtil.h @@ -612,6 +612,7 @@ class ConfigParser // pop out of content level contentLevel = false; } + if (quoteFound) { // skip the closing quote @@ -660,7 +661,7 @@ class ConfigParser std::string ReadConfigFiles(const std::string& filePaths); std::string ReadConfigFiles(const std::wstring& filePaths); std::string ResolveIncludeStatements(const std::string& configString, std::vector& resolvedConfigFiles); - void LoadConfigFile(const std::wstring & filePath); + void LoadConfigFile(const std::wstring& filePath); void LoadConfigFileAndResolveVariables(const std::wstring& filePath, const ConfigParameters& config); void LoadConfigFiles(const std::wstring& filePaths, const std::string* configStringToAppend = nullptr); @@ -873,17 +874,17 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary } // Insert - insert an 'name=value' string into the dictionary - void Insert(const std::string &str) + void Insert(const std::string& str) { ParseValue(str, 0, str.length()); } - bool Exists(const std::wstring & name) const + bool Exists(const std::wstring& name) const { return Exists(msra::strfun::utf8(name)); } - bool Exists(const std::string & name) const + bool Exists(const std::string& name) const { if (find(name) != end()) { @@ -899,42 +900,42 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary } // ExistsCurrent - check to see if a key exists in THIS config, don't check parent - bool ExistsCurrent(const std::string & name) const + bool ExistsCurrent(const std::string& name) const { return (find(name) != end()); } // dict(name, default) for strings - ConfigValue operator()(const std::wstring & name, - const wchar_t *defaultvalue) const + ConfigValue operator()(const std::wstring& name, + const wchar_t* defaultvalue) const { return operator()(msra::strfun::utf8(name), defaultvalue); } // dict(name, default) for strings - ConfigValue operator()(const std::string & name, - const wchar_t *defaultvalue) const + ConfigValue operator()(const std::string& name, + const wchar_t* defaultvalue) const { return operator()(name, msra::strfun::utf8(defaultvalue).c_str()); } // dict(name, default) for strings - ConfigValue operator()(const std::wstring & name, - const char *defaultvalue) const + ConfigValue operator()(const std::wstring& name, + const char* defaultvalue) const { return operator()(msra::strfun::utf8(name), defaultvalue); } // dict(name, default) for strings - ConfigValue operator()(const std::string & name, - const char *defaultvalue) const + ConfigValue operator()(const std::string& name, + const char* defaultvalue) const { 
ConfigValue value = Find(name, defaultvalue); return value; } - ConfigValue Find(const std::string & name, - const char *defaultvalue = NULL) const + ConfigValue Find(const std::string& name, + const char* defaultvalue = NULL) const { auto iter = find(name); ConfigValue result; @@ -975,10 +976,11 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary // any whitespace characters. If an opening "$" is found without a closing "$", an exception is thrown. // configString - the string that you would like to resolve variables in. // returns: A copy of 'configString' with all the variables resolved. - std::string ResolveVariablesInSingleLine(const std::string &configLine) const + std::string ResolveVariablesInSingleLine(const std::string& configLine) const { // ensure that this method was called on a single line (eg, no newline characters exist in 'configLine'). - if (configLine.find_first_of("\n") != std::string::npos) { + if (configLine.find_first_of("\n") != std::string::npos) + { throw std::logic_error( "\"ResolveVariablesInSingleLine\" shouldn't be called with a string containing a newline character"); } @@ -1053,7 +1055,7 @@ class ConfigParameters: public ConfigParser, public ConfigDictionary // we shouldn't insert newlines where they didn't already exist. // configString - the string that you would like to resolve variables in. // returns: A copy of 'configString' with all the variables resolved. - std::string ResolveVariables(const std::string &configString) const + std::string ResolveVariables(const std::string& configString) const { std::string newConfigString; if (configString.find_first_of("\n") != std::string::npos) @@ -1347,14 +1349,14 @@ class argvector: public std::vector RuntimeError("argvector: invalid arg value"); } } - static void parse(const std::wstring & in, std::wstring & val) + static void parse(const std::wstring& in, std::wstring& val) { val = in; } public: // constructor --construct empty, then assign a wstring from command-line argument - void operator=(const std::wstring & arg) + void operator=(const std::wstring& arg) { clear(); // separate the arguments @@ -1387,7 +1389,7 @@ class argvector: public std::vector } // constructor --use this for setting default values - argvector(const std::wstring & arg) + argvector(const std::wstring& arg) { *this = arg; } @@ -1438,7 +1440,7 @@ class argvector: public std::vector } // we give full read access to the vector, so we can use it bounded as well - const std::vector & tovector() const + const std::vector& tovector() const { return *this; } diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx index 9779be49f391..41ea99898cdf 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx @@ -1725,6 +1725,102 @@ numBestSearchEpoch \end_layout +\begin_layout Standard +Used in the Adaptive Minibatch Sizing mode. +\end_layout + +\begin_layout Itemize + +\emph on +numMiniBatch4LRSearch +\emph default + +\begin_inset Index idx +status open + +\begin_layout Plain Layout +numMiniBatch4LRSearch +\end_layout + +\end_inset + +: the number of minibatches used to search the minibatch size when +in adaptive minibatch size mode. + Default value is 500. + It's typically set to 10-20% of the total minibatches in an epoch +this is shared with the search for learning rate in +SearchBeforeEpoch mode. 
+
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+autoAdjustMinibatch
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+autoAdjustMinibatch
+\end_layout
+
+\end_inset
+
+: enable or disable whether the minibatch size is adaptively adjusted.
+ Default value is false.
+Adaptive minibatch sizing begins in the epochs
+after the explicitly specified user minibatch sizes
+are exhausted. For example, if the user specified
+minibatchSize=256:1024, then 256 and 1024 are used
+in the first two epochs and adaptive minibatch
+sizing is used afterwards.
+
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+minibatchSizeTuningFrequency
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+minibatchSizeTuningFrequency
+\end_layout
+
+\end_inset
+
+: the number of epochs to skip, on a periodic basis, before
+dynamically adjusting the minibatch size.
+ Default value is 1.
+
+\end_layout
+
+\begin_layout Itemize
+
+\emph on
+minibatchSizeTuningMax
+\emph default
+
+\begin_inset Index idx
+status open
+
+\begin_layout Plain Layout
+minibatchSizeTuningMax
+\end_layout
+
+\end_inset
+
+: the maximum size allowed for an
+adaptively adjusted minibatch size.
+ Default value is 1048576.
+
+\end_layout
+
 \end_deeper
 \begin_layout Subsubsection
 Gradient control
diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h
index b91352f452d8..266b8331b929 100644
--- a/MachineLearning/CNTK/SGD.h
+++ b/MachineLearning/CNTK/SGD.h
@@ -126,7 +126,7 @@ typedef struct stGradientUpdateInfo
 } GradientUpdateInfo;
 
 template<class ElemType>
-class SGD: ComputationNetworkHelper<ElemType>
+class SGD : ComputationNetworkHelper<ElemType>
 {
 protected:
     typedef ComputationNetworkHelper<ElemType> B;
@@ -157,17 +157,31 @@ class SGD : ComputationNetworkHelper<ElemType>
         ElemType learnRateDecreaseFactor = configAALR("learnRateDecreaseFactor", "0.618");
         ElemType increaseLearnRateIfImproveMoreThan = configAALR("increaseLearnRateIfImproveMoreThan", "1#INF");
         ElemType learnRateIncreaseFactor = configAALR("learnRateIncreaseFactor", "1.382");
+
+        // AutoAdjust: adaptive minibatch sizing parameters
+        bool autoAdjustMinibatch = (bool) configAALR("autoAdjustMinibatch", "false");
+        size_t minibatchSizeTuningFrequency = configAALR("minibatchSizeTuningFrequency", "1");
+        size_t minibatchSizeTuningMax = configAALR("minibatchSizeTuningMax", "1048576");
+
+        // the number of minibatches used to search
+        // the learning rate. It's typically set to 10-20% of
+        // the total minibatches in an epoch.
         ConfigArray minibatch4LRSearch = configAALR("numMiniBatch4LRSearch", "500");
         intargvector numMiniBatch4LRSearch = minibatch4LRSearch;
+
         size_t numPrevLearnRates = configAALR("numPrevLearnRates", "5");
         size_t numBestSearchEpoch = configAALR("numBestSearchEpoch", "1");
         bool loadBestModel = configAALR("loadBestModel", "true");
 
         ConfigArray minibatchSize = configSGD("minibatchSize", "256");
         intargvector mbSize = minibatchSize;
+
+        // the number of samples in each epoch (0 means use all the samples in each epoch).
         size_t epochSize = configSGD("epochSize", "0");
 
+        // the total number of epochs to run.
size_t maxEpochs = configSGD("maxEpochs"); + ConfigArray momentumPerMBStr = configSGD("momentumPerMB", ""); floatargvector momentumPerMB = momentumPerMBStr; @@ -240,7 +254,8 @@ class SGD: ComputationNetworkHelper trainCriterionNodeName, evalCriterionNodeName, doGradientCheck, gradientCheckSigDigit, validateAfterModelReloading, rpi, learnRateAdjustInterval, UsingAllDataForPreComputedNode, - needAveMultiplier, L2RegWeight, L1RegWeight); + needAveMultiplier, L2RegWeight, L1RegWeight, + autoAdjustMinibatch, minibatchSizeTuningFrequency, minibatchSizeTuningMax); } void setMomentum(float momentum) @@ -287,15 +302,27 @@ class SGD: ComputationNetworkHelper const bool UsingAllDataForPreComputed = true, const bool needAveMultiplier = true, const ElemType L2RegWeight = 0, - const ElemType L1RegWeight = 0) + const ElemType L1RegWeight = 0, + const bool autoAdjustMinibatch = false, + const size_t minibatchSizeTuningFrequency = 1, + const size_t minibatchSizeTuningMax = 1048576) { m_numPrevLearnRates = numPrevLearnRates; + m_prevChosenMinibatchSize = 0; + m_autoAdjustMinibatch = autoAdjustMinibatch; + m_minibatchSizeTuningMax = minibatchSizeTuningMax; + m_minibatchSizeTuningFrequency = minibatchSizeTuningFrequency; + m_mbSize = mbSize; + + // the number of samples in each epoch (0 means, use all the samples in each epoch). m_epochSize = epochSize; if (m_epochSize == 0) { m_epochSize = requestDataSize; } + + // the total number of epochs to run. m_maxEpochs = maxEpochs; m_gradientClippingWithTruncation = gradientClippingWithTruncation; @@ -346,7 +373,8 @@ class SGD: ComputationNetworkHelper (learningRatesPerSample.size() == 0 && learningRatesPerMB.size() == 0)) { throw std::invalid_argument( - "If autoLearnRateSearchType is false you must specify the learningRatesPerSample or learningRatesPerMB parameter."); + "If autoLearnRateSearchType is false you must specify the " + "learningRatesPerSample or learningRatesPerMB parameter."); } if (learningRatesPerSample.size() > 0 && learningRatesPerMB.size() > 0) @@ -368,6 +396,7 @@ class SGD: ComputationNetworkHelper } m_needToNormalizeLRByParallUtterance = true; } + m_momentumPerMB = 0.9f; if (momentumPerMB.size() > 0) { @@ -550,8 +579,8 @@ class SGD: ComputationNetworkHelper IDataReader* trainSetDataReader, IDataReader* validationSetDataReader) { - std::vector & FeatureNodes = net.FeatureNodes(); - std::vector & labelNodes = net.LabelNodes(); + std::vector& FeatureNodes = net.FeatureNodes(); + std::vector& labelNodes = net.LabelNodes(); std::vector criterionNodes = GetTrainCriterionNodes(net); std::vector evaluationNodes = GetEvalCriterionNodes(net); @@ -610,6 +639,7 @@ class SGD: ComputationNetworkHelper size_t totalSamplesSeen = 0; ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch]; + float learningRateAdjustmentFactor = 1.0f; vector prevLearnRates; prevLearnRates.resize(m_numPrevLearnRates); for (int i = 0; i < m_numPrevLearnRates; i++) @@ -640,7 +670,7 @@ class SGD: ComputationNetworkHelper if (startEpoch > 0) { learnRateInitialized = LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen, - learnRatePerSample, smoothedGradients, prevCriterion); + learnRatePerSample, smoothedGradients, prevCriterion, m_prevChosenMinibatchSize); if (learnRateInitialized) { prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample; @@ -653,7 +683,8 @@ class SGD: ComputationNetworkHelper !learnRateInitialized && m_learningRatesPerSample.size() <= startEpoch) { throw std::invalid_argument( - "When using \"AdjustAfterEpoch\", there must either exist a checkpoint 
file, or an explicit learning rate must be specified in config for the starting epoch."); + "When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, " + "or an explicit learning rate must be specified in config for the starting epoch."); } unsigned long dropOutSeed = 1; @@ -667,17 +698,18 @@ class SGD: ComputationNetworkHelper SetMaxTempMemSizeForCNN(refNet, refNode, m_maxTempMemSizeInSamplesForCNN); } - for (int i = int(startEpoch); i < int(m_maxEpochs); i++) + for (int i = startEpoch; i < (int) m_maxEpochs; i++) { auto t_start_epoch = Timer::MilliSecondElapsed(); - //set dropout rate + // set dropout rate SetDropoutRate(net, criterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); setMomentum(m_momentumInputPerMB[i]); - //learning rate adjustment - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) + // learning rate adjustment + if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || + (m_learningRatesPerSample.size() > 0 && m_learningRatesPerSample.size() > i)) { learnRatePerSample = m_learningRatesPerSample[i]; } @@ -689,14 +721,16 @@ class SGD: ComputationNetworkHelper largestPrevLearnRatePerSample = max(largestPrevLearnRatePerSample, prevLearnRates[j]); } - //return a reasonable learning rate based on the initial mbsize - learnRatePerSample = SearchLearnRateBeforeEpoch(net, refNet, refNode, i, learnRatePerSample, - trainSetDataReader, FeatureNodes, labelNodes, - criterionNodes, evaluationNodes, inputMatrices, - learnableNodes, smoothedGradients, learnRateInitialized, - largestPrevLearnRatePerSample); + // return a reasonable learning rate based on the initial minibatchSize + ElemType newLearningRatePerSample = SearchForBestLearnRate(net, refNet, refNode, i, learnRatePerSample, + trainSetDataReader, FeatureNodes, labelNodes, + criterionNodes, evaluationNodes, inputMatrices, + learnableNodes, smoothedGradients, + learnRateInitialized, largestPrevLearnRatePerSample); + learningRateAdjustmentFactor = newLearningRatePerSample / learnRatePerSample; + learnRatePerSample = newLearningRatePerSample; - //save per sample learn rate to support changeable mbsize + // save per sample learn rate to support changeable minibatchSize prevLearnRates[i % m_numPrevLearnRates] = learnRatePerSample; } @@ -704,8 +738,7 @@ class SGD: ComputationNetworkHelper if (learnRatePerSample < m_minLearnRate) { - fprintf(stderr, - "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", + fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate); if (m_autoLearnRateSearchType != LearningRateSearchAlgorithm::None) { @@ -718,10 +751,40 @@ class SGD: ComputationNetworkHelper INT32 mySamples = (INT32) #endif fprintf(stderr, "Starting Epoch %d: learning rate per sample = %f momentum = %f \n", - i, learnRatePerSample, m_momentumPerMB); + i + 1, learnRatePerSample, m_momentumPerMB); + + size_t chosenMinibatchSize; + + // Through the command line or config file the user can set minibatch sizes on a per epoch + // basis for a set number of epochs. For epochs after that point, m_mbSize.size(), either + // we just keep using + // the last minibatch size, or we use tuning to try and find a better one. 
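+                // As an illustration (restating the example settings from the CNTK_Chapter
+                // docs above, not new behavior): with minibatchSize=256:1024 and
+                // autoAdjustMinibatch=true, epochs 1 and 2 use the explicit sizes 256 and
+                // 1024, and from epoch 3 on (i.e. once i >= m_mbSize.size()) the minibatch
+                // size is tuned adaptively as shown below.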
+ if (m_autoAdjustMinibatch && i >= m_mbSize.size()) + { + size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[i] * m_mbSize[i]; + if (m_epochSize != requestDataSize) + { + // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch + numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize); + } + + // Use tuning to try and find a better minibatch size + chosenMinibatchSize = AdaptiveMinibatchSizing(net, refNet, refNode, i, + numFramesToUseInSearch, + trainSetDataReader, learnRatePerSample, + m_mbSize[i], FeatureNodes, labelNodes, + criterionNodes, evaluationNodes, + inputMatrices, learnableNodes, + smoothedGradients, learningRateAdjustmentFactor); + } + else + { + // use the explicitly set minibatch size + chosenMinibatchSize = m_mbSize[i]; + } TrainOneEpoch(net, refNet, refNode, i, m_epochSize, - trainSetDataReader, learnRatePerSample, FeatureNodes, + trainSetDataReader, learnRatePerSample, chosenMinibatchSize, FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, epochCriterion, epochEvalErrors, totalSamplesSeen); @@ -730,23 +793,23 @@ class SGD: ComputationNetworkHelper ElemType epochTime = (t_end_epoch - t_start_epoch) / ElemType(MS_PER_SEC); fprintf(stderr, - "Finished Epoch[%d]: [Training Set] Train Loss Per Sample = %.8g ", + "Finished Epoch[%d]: [Training Set] TrainLossPerSample = %.8g; ", i + 1, epochCriterion); if (epochEvalErrors.size() == 1) { fprintf(stderr, - "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", + "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g; EpochTime=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime); } else { - fprintf(stderr, "EvalErr Per Sample "); + fprintf(stderr, "EvalErrPerSample "); for (size_t j = 0; j < epochEvalErrors.size(); j++) { - fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]); + fprintf(stderr, "[%lu]=%.8g; ", j, epochEvalErrors[j]); } - fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", + fprintf(stderr, "Ave LearnRatePerSample = %.10g; Epoch Time=%.8g\n", learnRatePerSample, epochTime); fprintf(stderr, "Finished Epoch[%d]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, criterionNodes[0]->NodeName().c_str(), epochCriterion); @@ -810,7 +873,7 @@ class SGD: ComputationNetworkHelper cvSetTrainAndEvalNodes.push_back(evaluationNodes[0]->NodeName()); vector vScore = evalforvalidation.Evaluate(*validationSetDataReader, cvSetTrainAndEvalNodes, m_mbSize[i]); - fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n", + fprintf(stderr, "Finished Epoch[%d]: [Validation Set] TrainLossPerSample = %.8g; EvalErrPerSample = %.8g\n", i + 1, vScore[0], vScore[1]); epochCriterion = vScore[0]; //the first one is the training criterion. 
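For reference, here is a minimal, self-contained C++ sketch (not part of the patch; the bounds 256 and 4096 are assumed purely for illustration) of the trial-size schedule that AdaptiveMinibatchSizing() and SearchForBestMinibatchSize() below walk through: candidate sizes grow by a factor of sqrt(2) per step and are rounded to multiples of 64.

#include <cmath>
#include <cstddef>
#include <cstdio>

// Mirrors RoundToMultipleOf64() from this patch: round to the nearest multiple of 64.
static size_t RoundToMultipleOf64(float val)
{
    return 64 * (size_t) ((val + 32) / 64);
}

int main()
{
    const float minibatchSizeTuningFactor = sqrtf(2.0f); // growth factor used by the search
    const size_t minMinibatchSize = 256;  // assumed lower bound for this illustration
    const size_t maxMinibatchSize = 4096; // assumed upper bound (minibatchSizeTuningMax caps it)

    // Enumerate candidate sizes in the order the search would evaluate them:
    // 256, 384, 512, 704, 1024, ... up to (at most) maxMinibatchSize.
    for (float trialSize = (float) minMinibatchSize;
         trialSize <= maxMinibatchSize;
         trialSize *= minibatchSizeTuningFactor)
    {
        printf("trial minibatchSize = %d\n", (int) RoundToMultipleOf64(trialSize));
    }
    return 0;
}

In the actual code, each trial additionally runs TrainOneMiniEpochAndReloadModel() on numFramesToUseInSearch frames, and the loop stops as soon as a trial's epochCriterion degrades past the baseline criterion measured at the first trial.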
@@ -847,7 +910,7 @@ class SGD : ComputationNetworkHelper<ElemType>
                     net.ResetEvalTimeStamp();
 
                     LoadCheckPointInfo(i - 1, totalSamplesSeen, learnRatePerSample,
-                                       smoothedGradients, prevCriterion);
+                                       smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
                     fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
                     loadedPrevModel = true;
                 }
@@ -911,7 +974,7 @@ class SGD : ComputationNetworkHelper<ElemType>
             if (mpiRank == 0)
             {
                 net.SaveToFile(GetModelNameForEpoch(i));
-                SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion);
+                SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, chosenMinibatchSize);
                 if (!m_keepCheckPointFiles)
                 {
                     // delete the previous checkpoint file to save space
                 }
             }
 
-            if (learnRatePerSample < 1e-12) {
+            if (learnRatePerSample < 1e-12)
+            {
                 fprintf(stderr, "learnRate per sample is reduced to %.8g, which is below 1e-12. Stopping training.\n",
                         learnRatePerSample);
             }
@@ -938,8 +1002,7 @@ class SGD : ComputationNetworkHelper<ElemType>
     }
 
 protected:
-
-    //return true if precomputation is executed.
+    // return true if precomputation is executed.
     bool PreCompute(ComputationNetwork<ElemType>& net,
                     IDataReader<ElemType>* trainSetDataReader,
                     std::vector<ComputationNodePtr>& FeatureNodes,
@@ -965,7 +1028,8 @@ class SGD : ComputationNetworkHelper<ElemType>
         //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, requestDataSize);
         //trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); // only based on one epoch
         // [1/12/2015 erw] to support large datasets, we usually partition the whole dataset
         // into several epochs, so we need to use all the data to do the precomputation
-        if (m_useAllDataForPreComputedNode) {
+        if (m_useAllDataForPreComputedNode)
+        {
             // using all the data
             trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0);
         }
@@ -1000,21 +1064,21 @@ class SGD : ComputationNetworkHelper<ElemType>
         return true;
     }
 
-    //return a reasonable initial learning rate based on the initial mbsize
-    ElemType SearchLearnRateBeforeEpoch(ComputationNetwork<ElemType>& net,
-                                        ComputationNetwork<ElemType>& refNet,
-                                        const ComputationNodePtr refNode, const int epochNumber,
-                                        const ElemType curLearnRate,
-                                        IDataReader<ElemType>* trainSetDataReader,
-                                        const std::vector<ComputationNodePtr>& FeatureNodes,
-                                        const std::vector<ComputationNodePtr>& labelNodes,
-                                        const std::vector<ComputationNodePtr>& criterionNodes,
-                                        const std::vector<ComputationNodePtr>& evaluationNodes,
-                                        std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
-                                        const std::list<ComputationNodePtr>& learnableNodes,
-                                        std::list<Matrix<ElemType>>& smoothedGradients,
-                                        const bool learnRateInitialized,
-                                        const ElemType largestPrevLearnRatePerSample)
+    // return a reasonable initial learning rate based on the initial mbsize
+    ElemType SearchForBestLearnRate(ComputationNetwork<ElemType>& net,
+                                    ComputationNetwork<ElemType>& refNet,
+                                    const ComputationNodePtr refNode, const int epochNumber,
+                                    const ElemType curLearnRate,
+                                    IDataReader<ElemType>* trainSetDataReader,
+                                    const std::vector<ComputationNodePtr>& FeatureNodes,
+                                    const std::vector<ComputationNodePtr>& labelNodes,
+                                    const std::vector<ComputationNodePtr>& criterionNodes,
+                                    const std::vector<ComputationNodePtr>& evaluationNodes,
+                                    std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
+                                    const std::list<ComputationNodePtr>& learnableNodes,
+                                    std::list<Matrix<ElemType>>& smoothedGradients,
+                                    const bool learnRateInitialized,
+                                    const ElemType largestPrevLearnRatePerSample)
     {
         ElemType epochCriterion = std::numeric_limits<ElemType>::infinity();
         ElemType prevCriterion = std::numeric_limits<ElemType>::infinity();
@@ -1023,10 +1087,11 @@ class SGD : ComputationNetworkHelper<ElemType>
         size_t totalSamplesSeen = 0;
         ElemType bestLearnRatePerSample = curLearnRate;
 
-        size_t epochSize = m_numMiniBatch4LRSearch[epochNumber] * m_mbSize[epochNumber];
+        size_t numFramesToUseInSearch = m_numMiniBatch4LRSearch[epochNumber] *
m_mbSize[epochNumber]; if (m_epochSize != requestDataSize) { - epochSize = min(epochSize, m_epochSize); //use a small number minibatches to make decision + // ensure the numFramesToUseInSearch does not exceed the total number of frames in the epoch + numFramesToUseInSearch = min(numFramesToUseInSearch, m_epochSize); } ElemType baseCriterion; @@ -1045,17 +1110,18 @@ class SGD: ComputationNetworkHelper net.ResetEvalTimeStamp(); ElemType learnRate = learnRatePerSample; + size_t dummyMinibatchSize = 0; LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate, - smoothedGradients, prevCriterion); + smoothedGradients, prevCriterion, dummyMinibatchSize); - //if model is not changed this is what we will get + // if model is not changed this is what we will get TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, trainSetDataReader, 0, + numFramesToUseInSearch, trainSetDataReader, 0, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, baseCriterion, - epochEvalErrors, totalSamplesSeen); + epochEvalErrors, totalSamplesSeen, "BaseAdaptiveLearnRateSearch:"); if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) { @@ -1068,7 +1134,7 @@ class SGD: ComputationNetworkHelper if (m_epochSize != requestDataSize) { - ratio = pow(((ElemType) epochSize) / m_epochSize, 1.0f / 2); + ratio = pow(((ElemType) numFramesToUseInSearch) / m_epochSize, 1.0f / 2); } baseCriterion = max(ratio * prevCriterion + (1 - ratio) * baseCriterion, baseCriterion); @@ -1078,13 +1144,13 @@ class SGD: ComputationNetworkHelper { learnRatePerSample *= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, trainSetDataReader, - learnRatePerSample, FeatureNodes, + numFramesToUseInSearch, trainSetDataReader, + learnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, epochCriterion, epochEvalErrors, - totalSamplesSeen); + totalSamplesSeen, "AdaptiveLearnRateSearch:"); } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate)); @@ -1099,13 +1165,13 @@ class SGD: ComputationNetworkHelper ElemType leftCriterion, rightCriterion = epochCriterion; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, - epochSize, trainSetDataReader, - leftLearnRatePerSample, + numFramesToUseInSearch, trainSetDataReader, + leftLearnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, smoothedGradients, leftCriterion, - epochEvalErrors, totalSamplesSeen); + epochEvalErrors, totalSamplesSeen, "DetailBaseAdaptiveLearnRateSearch:"); while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) { @@ -1114,9 +1180,9 @@ class SGD: ComputationNetworkHelper rightLearnRatePerSample *= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, - epochNumber, epochSize, + epochNumber, numFramesToUseInSearch, trainSetDataReader, - rightLearnRatePerSample, + rightLearnRatePerSample, m_mbSize[epochNumber], FeatureNodes, labelNodes, criterionNodes, evaluationNodes, @@ -1125,16 +1191,16 @@ class SGD: ComputationNetworkHelper smoothedGradients, rightCriterion, epochEvalErrors, - totalSamplesSeen); + totalSamplesSeen, "DetailRightAdaptiveLearnRateSearch:"); } else { leftLearnRatePerSample /= 0.618f; TrainOneMiniEpochAndReloadModel(net, refNet, refNode, - epochNumber, epochSize, + 
epochNumber, numFramesToUseInSearch,
                                                 trainSetDataReader,
-                                                leftLearnRatePerSample,
+                                                leftLearnRatePerSample, m_mbSize[epochNumber],
                                                 FeatureNodes,
                                                 labelNodes,
                                                 criterionNodes,
                                                 evaluationNodes,
                                                 inputMatrices,
                                                 learnableNodes,
                                                 smoothedGradients,
                                                 leftCriterion,
                                                 epochEvalErrors,
-                                                totalSamplesSeen);
+                                                totalSamplesSeen, "DetailLeftAdaptiveLearnRateSearch:");
             }
         }
@@ -1162,55 +1228,252 @@ class SGD : ComputationNetworkHelper<ElemType>
                                          const ComputationNodePtr refNode, const int epochNumber,
                                          const size_t epochSize, IDataReader<ElemType>* trainSetDataReader,
                                          const ElemType learnRatePerSample,
+                                         const size_t minibatchSize,
                                          const std::vector<ComputationNodePtr>& FeatureNodes,
                                          const std::vector<ComputationNodePtr>& labelNodes,
                                          const std::vector<ComputationNodePtr>& criterionNodes,
                                          const std::vector<ComputationNodePtr>& evaluationNodes,
                                          std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
                                          const std::list<ComputationNodePtr>& learnableNodes,
-                                         std::list<Matrix<ElemType>>& smoothedGradients,
-                                         ElemType& epochCriterion, std::vector<ElemType>& epochEvalErrors,
-                                         size_t& totalSamplesSeen)
+                                         /*out*/ std::list<Matrix<ElemType>>& smoothedGradients,
+                                         /*out*/ ElemType& epochCriterion, std::vector<ElemType>& epochEvalErrors,
+                                         /*out*/ size_t& totalSamplesSeen,
+                                         std::string prefixMsg = "")
     {
         TrainOneEpoch(net, refNet, refNode, epochNumber, epochSize,
-                      trainSetDataReader, learnRatePerSample, FeatureNodes,
+                      trainSetDataReader, learnRatePerSample, minibatchSize, FeatureNodes,
                       labelNodes, criterionNodes, evaluationNodes,
                       inputMatrices, learnableNodes, smoothedGradients,
-                      epochCriterion, epochEvalErrors, totalSamplesSeen);
-        fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: Train Loss Per Sample = %.8g ",
+                      /*out*/ epochCriterion, /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                      prefixMsg);
+
+        fprintf(stderr, "Finished Mini-Epoch For LearnRate Selection: TrainLossPerSample = %.8g; ",
                 epochCriterion);
 
         if (epochEvalErrors.size() == 1)
         {
-            fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g\n",
+            fprintf(stderr, "EvalErrPerSample = %.8g; Ave LearnRatePerSample = %.10g\n",
                     epochEvalErrors[0], learnRatePerSample);
         }
         else
        {
-            fprintf(stderr, "EvalErr Per Sample ");
+            fprintf(stderr, "EvalErrPerSample ");
            for (size_t i = 0; i < epochEvalErrors.size(); i++)
            {
-                fprintf(stderr, "[%lu] = %.8g ", i, epochEvalErrors[i]);
+                fprintf(stderr, "[%lu] = %.8g; ", i, epochEvalErrors[i]);
            }
-            fprintf(stderr, "Ave Learn Rate Per Sample = %.10g\n", learnRatePerSample);
+            fprintf(stderr, "Ave LearnRatePerSample = %.10g\n", learnRatePerSample);
        }
 
        int baseModelEpoch = epochNumber - 1;
        net.LoadPersistableParametersFromFile(GetModelNameForEpoch(baseModelEpoch), m_validateAfterModelReloading);
        net.ResetEvalTimeStamp();
 
-        ElemType learnRate;
-        ElemType prevCriterion;
-        LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate,
-                           smoothedGradients, prevCriterion);
+        ElemType dummyLearnRate;
+        ElemType dummyPrevCriterion;
+        size_t dummyMinibatchSize = 0;
+        LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, dummyLearnRate,
+                           smoothedGradients, dummyPrevCriterion, dummyMinibatchSize);
    }
 
+    size_t AdaptiveMinibatchSizing(ComputationNetwork<ElemType>& net,
+                                   ComputationNetwork<ElemType>& refNet,
+                                   const ComputationNodePtr refNode,
+                                   const int epochNumber,
+                                   const size_t numFramesToUseInSearch,
+                                   const IDataReader<ElemType>* trainSetDataReader,
+                                   const ElemType learnRatePerSample,
+                                   const size_t initialMinibatchSize,
+                                   const std::vector<ComputationNodePtr>& FeatureNodes,
+                                   const std::vector<ComputationNodePtr>& labelNodes,
+                                   const std::vector<ComputationNodePtr>& criterionNodes,
+                                   const std::vector<ComputationNodePtr>& evaluationNodes,
+                                   std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
+                                   const std::list<ComputationNodePtr>& learnableNodes,
+                                   std::list<Matrix<ElemType>>& smoothedGradients,
+                                   const float learningRateAdjustmentFactor)
+    {
+        size_t minMinibatchSize = initialMinibatchSize;
+        size_t chosenMinibatchSize = initialMinibatchSize;
+
+        // do some pre-adjustment based on LR.
+        // Basically we assume that the LR for epoch 1 is safe for the minibatch size.
+        // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size.
+        float learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0];
+        learningRateChangeSoFar *= learningRateAdjustmentFactor;
+
+        // increasing by the full factor is found to be too aggressive; sqrt() seems more robust
+        learningRateChangeSoFar = sqrt(learningRateChangeSoFar);
+
+        // LR was indeed reduced
+        if (learningRateChangeSoFar < 1.0f)
+        {
+            // we can safely increase the MB size (note: this may be bigger than our max)
+            minMinibatchSize = (size_t) (minMinibatchSize / learningRateChangeSoFar);
+        }
+
+        if (epochNumber < 2 && m_prevChosenMinibatchSize != 0)
+        {
+            // newly started training: any previous MB size stored in the model is to be ignored
+            fprintf(stderr, "before epoch 2, previous minibatchSize %d is "
+                    "considered invalid -> resetting\n", (int) m_prevChosenMinibatchSize);
+            m_prevChosenMinibatchSize = 0;
+        }
+
+        // check if we need to skip
+        if (m_prevChosenMinibatchSize != 0 &&
+            (epochNumber + 1) > m_minibatchSizeTuningFrequency &&
+            (epochNumber + 1) % m_minibatchSizeTuningFrequency != 0)
+        {
+            fprintf(stderr, "AdaptiveMinibatchSearch: Search for a better minibatchSize "
+                    "in epoch %d skipped, keeping minibatchSize of %d\n",
+                    epochNumber + 1, (int) m_prevChosenMinibatchSize);
+            chosenMinibatchSize = m_prevChosenMinibatchSize;
+        }
+        else
+        {
+            if (m_prevChosenMinibatchSize != 0)
+            {
+                // but we don't go lower than 0.5 * the previously chosen size
+                fprintf(stderr, "AdaptiveMinibatchSearch: Limiting minMinibatchSize to "
+                        "previous minibatchSize = (%d / 2)\n", (int) m_prevChosenMinibatchSize);
+                minMinibatchSize = max(minMinibatchSize, m_prevChosenMinibatchSize / 2);
+            }
+
+            size_t maxMinibatchSize = m_minibatchSizeTuningMax;
+
+            // only grow at most 2x compared to the previous step
+            if (m_prevChosenMinibatchSize != 0)
+            {
+                if (m_prevChosenMinibatchSize < chosenMinibatchSize)
+                {
+                    m_prevChosenMinibatchSize = chosenMinibatchSize;
+                }
+
+                fprintf(stderr, "AdaptiveMinibatchSearch: Limiting maxMinibatchSize to "
+                        "previous minibatchSize %d*2\n", (int) m_prevChosenMinibatchSize);
+                maxMinibatchSize = min(maxMinibatchSize, m_prevChosenMinibatchSize * 2);
+            }
+
+            chosenMinibatchSize = SearchForBestMinibatchSize(net, refNet, refNode, epochNumber,
+                                                             numFramesToUseInSearch,
+                                                             trainSetDataReader, learnRatePerSample,
+                                                             FeatureNodes, labelNodes,
+                                                             criterionNodes, evaluationNodes,
+                                                             inputMatrices, learnableNodes,
+                                                             smoothedGradients,
+                                                             minMinibatchSize, maxMinibatchSize);
+        }
+
+        return chosenMinibatchSize;
+    }
+
+    size_t RoundToMultipleOf64(float val)
+    {
+        return 64 * (size_t) ((val + 32) / 64);
+    }
+
+    // uses a small percentage of the training data to speculatively
+    // train with various MB sizes; then picks the best
+    size_t SearchForBestMinibatchSize(ComputationNetwork<ElemType>& net,
+                                      ComputationNetwork<ElemType>& refNet,
+                                      const ComputationNodePtr refNode,
+                                      const int epochNumber,
+                                      const size_t numFramesToUseInSearch,
+                                      IDataReader<ElemType>* trainSetDataReader,
+                                      const ElemType learnRatePerSample,
+                                      const std::vector<ComputationNodePtr>& FeatureNodes,
+                                      const std::vector<ComputationNodePtr>& labelNodes,
+                                      const std::vector<ComputationNodePtr>& criterionNodes,
+                                      const std::vector<ComputationNodePtr>& evaluationNodes,
+                                      std::map<std::wstring, Matrix<ElemType>*>& inputMatrices,
+                                      const std::list<ComputationNodePtr>& learnableNodes,
+                                      std::list<Matrix<ElemType>>& smoothedGradients,
+                                      const size_t minMinibatchSize, const size_t
maxMinibatchSize) + { + // may happen for automatically reduced learning rates + if (minMinibatchSize > maxMinibatchSize) + { + return maxMinibatchSize; + } + + size_t trialMinibatchSize = 0; + bool isFirstIteration = true; + ElemType baseCriterion; + + // increase the minibatch size by a factor of sqrt(2) in each step. + const float minibatchSizeTuningFactor = sqrtf(2.0f); + + size_t lastTriedtrialMinibatchSize = -1; + for (float trialMinibatchSizeFloat = (float) minMinibatchSize; + trialMinibatchSizeFloat <= maxMinibatchSize; + trialMinibatchSizeFloat *= minibatchSizeTuningFactor) + { + // round mbsize to something meaningful + trialMinibatchSize = RoundToMultipleOf64(trialMinibatchSizeFloat); + + fprintf(stderr, "\nAdaptiveMinibatchSearch: Evaluating trial minibatchSize=%d out of range %d..%d ...\n\n", + trialMinibatchSize, RoundToMultipleOf64(minMinibatchSize), RoundToMultipleOf64(maxMinibatchSize)); + + size_t totalSamplesSeen; + std::vector epochEvalErrors(evaluationNodes.size(), std::numeric_limits::infinity()); + ElemType epochCriterion = std::numeric_limits::infinity(); + + // Train on a few minibatches and so we can observe the epochCriterion as we try increasing + // minibatches with iteration of this loop. + TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber, + numFramesToUseInSearch, trainSetDataReader, + learnRatePerSample, trialMinibatchSize, FeatureNodes, + labelNodes, criterionNodes, + evaluationNodes, inputMatrices, + learnableNodes, smoothedGradients, + /*out*/ epochCriterion, /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + isFirstIteration ? "BaseAdaptiveMinibatchSearch:" : + "AdaptiveMinibatchSearch:"); + + if (isFirstIteration) + { + // for the first iteration of the loop only, set baseCriterion + // to the result we got from TrainOneMiniEpochAndReloadModel(). + baseCriterion = epochCriterion; + lastTriedtrialMinibatchSize = trialMinibatchSize; + isFirstIteration = false; + + fprintf(stderr, "AdaptiveMinibatchSearch: Computed BaseCriterion %.10g\n", baseCriterion); + } + else if (!std::isnan(epochCriterion) && (epochCriterion > baseCriterion)) + { + fprintf(stderr, "AdaptiveMinibatchSearch: Search successful!!! Choose new minibatchSize of %d. " + "EpochCriterion = %.10g vs BaseCriterion = %.10g\n\n", + lastTriedtrialMinibatchSize, epochCriterion, baseCriterion); + + // As soon as we see the Criterion (a measure of error) start to get larger than the + // Criterion we started with, we stop. + // TODO: if this is too sensitive, we can add a margin on the bases of percentage of + // baseCriterion. + break; + } + else + { + lastTriedtrialMinibatchSize = trialMinibatchSize; + fprintf(stderr, "AdaptiveMinibatchSearch: Keep searching... 
" + "EpochCriterion = %.10g vs BaseCriterion = %.10g\n", + epochCriterion, baseCriterion); + } + } + + return lastTriedtrialMinibatchSize; } size_t TrainOneEpoch(ComputationNetwork& net, ComputationNetwork& refNet, - const ComputationNodePtr refNode, const int epochNumber, - const size_t epochSize, IDataReader* trainSetDataReader, + const ComputationNodePtr refNode, + const int epochNumber, + const size_t epochSize, + IDataReader* trainSetDataReader, const ElemType learnRatePerSample, + size_t tunedMBSize, const std::vector& FeatureNodes, const std::vector& labelNodes, const std::vector& criterionNodes, @@ -1218,8 +1481,10 @@ class SGD: ComputationNetworkHelper std::map*>& inputMatrices, const std::list& learnableNodes, std::list>& smoothedGradients, - ElemType& epochCriterion, std::vector& epochEvalErrors, - size_t& totalSamplesSeen) + /*out*/ ElemType& epochCriterion, + /*out*/ std::vector& epochEvalErrors, + /*out*/ size_t& totalSamplesSeen, + std::string prefixMsg = "") { ElemType readTimeInMBs = 0; ElemType ComputeTimeInMBs = 0; @@ -1239,6 +1504,7 @@ class SGD: ComputationNetworkHelper size_t numEvalNodes = epochEvalErrors.size(); //assume only one training criterion node for each epoch + Matrix localEpochCriterion(1, 1, net.GetDeviceID()); Matrix localEpochEvalErrors(1, numEvalNodes, net.GetDeviceID()); @@ -1249,7 +1515,7 @@ class SGD: ComputationNetworkHelper // resetting this, so profiling is performed for one epoch only m_numMBsToCUDAProfile = 0; - trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); + trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize); startReadMBTime = Timer::MilliSecondElapsed(); while (trainSetDataReader->GetMinibatch(inputMatrices)) @@ -1350,18 +1616,18 @@ class SGD: ComputationNetworkHelper epochEvalErrors[i] = (const ElemType) localEpochEvalErrors(0, i); } - fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Train Loss Per Sample = %.8g ", - epochNumber + 1, numMBsRun - m_numMBsToShowResult + 1, - numMBsRun, numSamplesLastMBs, + fprintf(stderr, "%s Epoch[%d of %d]-Minibatch[%d-%d of %d]: SamplesSeen = %d; TrainLossPerSample = %.8g; ", + prefixMsg.c_str(), epochNumber + 1, m_maxEpochs, numMBsRun - m_numMBsToShowResult + 1, + numMBsRun, epochSize / tunedMBSize, numSamplesLastMBs, (epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs); for (size_t i = 0; i < numEvalNodes; i++) { - fprintf(stderr, "EvalErr[%lu] Per Sample = %.8g ", + fprintf(stderr, "EvalErr[%lu]PerSample = %.8g; ", i, (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs); } - fprintf(stderr, "ReadData Time = %.8g Computing Time=%.8g Total Time Per Sample=%.8g\n", + fprintf(stderr, "ReadDataTime = %.8g; ComputeTime=%.8g; TotalTimePerSample=%.8g\n", readTimeInMBs, ComputeTimeInMBs, (readTimeInMBs + ComputeTimeInMBs) / numSamplesLastMBs); @@ -1482,7 +1748,8 @@ class SGD: ComputationNetworkHelper } // L1 regularizer with proximal gradient descent method - if (L1RegWeight > 0) { + if (L1RegWeight > 0) + { //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample functionValues.InplaceSoftThreshold(learnRatePerSample * L1RegWeight * actualMBSize); } @@ -1536,7 +1803,8 @@ class SGD: ComputationNetworkHelper void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, const ElemType learnRatePerSample, const std::list>& smoothedGradients, - const ElemType prevCriterion) + const ElemType prevCriterion, + const size_t minibatchSize) { wstring checkPointFileName 
= GetCheckPointFileNameForEpoch(int(epoch)); @@ -1548,6 +1816,10 @@ class SGD: ComputationNetworkHelper fstream << totalSamplesSeen << learnRatePerSample << prevCriterion; fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"); + fstream << minibatchSize; + fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); + fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) @@ -1561,12 +1833,13 @@ class SGD: ComputationNetworkHelper fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP"); } - bool LoadCheckPointInfo(const size_t epoch, size_t& totalSamplesSeen, + bool LoadCheckPointInfo(const size_t epochNumber, size_t& totalSamplesSeen, ElemType& learnRatePerSample, std::list>& smoothedGradients, - ElemType& prevCriterion) + ElemType& prevCriterion, + size_t& minibatchSize) { - wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epoch)); + wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber)); if (!fexists(checkPointFileName.c_str())) { fprintf(stderr, @@ -1582,6 +1855,17 @@ class SGD: ComputationNetworkHelper fstream >> totalSamplesSeen >> learnRatePerSample >> prevCriterion; fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELearnRate"); + if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize")) + { + fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"); + fstream >> minibatchSize; + fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize"); + } + else + { + minibatchSize =m_mbSize[epochNumber]; + } + fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient"); for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++) @@ -1850,8 +2134,13 @@ class SGD: ComputationNetworkHelper bool m_needToNormalizeLRByParallUtterance; intargvector m_mbSize; + + // the number of samples in each epoch (0 means, use all the samples in each epoch). size_t m_epochSize; + + // the total number of epochs to run. 
size_t m_maxEpochs; + floatargvector m_momentumInputPerMB; ElemType m_momentumPerMB; bool m_gradientClippingWithTruncation; @@ -1880,6 +2169,10 @@ class SGD: ComputationNetworkHelper ElemType m_increaseLearnRateIfImproveMoreThan; ElemType m_learnRateIncreaseFactor; ElemType m_learnRateDecreaseFactor; + size_t m_prevChosenMinibatchSize; + bool m_autoAdjustMinibatch; + size_t m_minibatchSizeTuningFrequency; + size_t m_minibatchSizeTuningMax; floatargvector m_dropoutRates; size_t m_maxTempMemSizeInSamplesForCNN; From cac315623bdea2dfeab42d43f5bafa6a61a8356c Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 6 Jul 2015 16:16:54 -0700 Subject: [PATCH 2/6] Fixed Windows build --- MachineLearning/CNTK/SGD.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h index 266b8331b929..fc2eb0cf1c4c 100644 --- a/MachineLearning/CNTK/SGD.h +++ b/MachineLearning/CNTK/SGD.h @@ -639,7 +639,7 @@ class SGD : ComputationNetworkHelper size_t totalSamplesSeen = 0; ElemType learnRatePerSample = 0.5f / m_mbSize[startEpoch]; - float learningRateAdjustmentFactor = 1.0f; + ElemType learningRateAdjustmentFactor = 1.0f; vector prevLearnRates; prevLearnRates.resize(m_numPrevLearnRates); for (int i = 0; i < m_numPrevLearnRates; i++) @@ -1282,7 +1282,7 @@ class SGD : ComputationNetworkHelper const ComputationNodePtr refNode, const int epochNumber, const size_t numFramesToUseInSearch, - const IDataReader* trainSetDataReader, + IDataReader* trainSetDataReader, const ElemType learnRatePerSample, const size_t initialMinibatchSize, const std::vector& FeatureNodes, @@ -1292,7 +1292,7 @@ class SGD : ComputationNetworkHelper std::map*>& inputMatrices, const std::list& learnableNodes, std::list>& smoothedGradients, - const float learningRateAdjustmentFactor) + const ElemType learningRateAdjustmentFactor) { size_t minMinibatchSize = initialMinibatchSize; size_t chosenMinibatchSize = initialMinibatchSize; @@ -1300,7 +1300,7 @@ class SGD : ComputationNetworkHelper // do some pre-adjustment based on LR // Basically we assume that the LR for epoch 1 is safe for mbsize. // If LR control led to a smaller LR, then we can safely increase the lower bound of the MB size. - float learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0]; + ElemType learningRateChangeSoFar = m_learningRatesPerSample[epochNumber] / m_learningRatesPerSample[0]; learningRateChangeSoFar *= learningRateAdjustmentFactor; // increasing by the full factor is found to be too aggressive; sqrt() seems more robust @@ -1373,6 +1373,11 @@ class SGD : ComputationNetworkHelper return 64 * (size_t) ((val + 32) / 64); } + size_t RoundToMultipleOf64(size_t val) + { + return 64 * ((val + 32) / 64); + } + // uses a small percentage of training data of minibatch to // speculatively train with various MB sizes; then picks the best size_t SearchForBestMinibatchSize(ComputationNetwork& net, @@ -1399,12 +1404,12 @@ class SGD : ComputationNetworkHelper size_t trialMinibatchSize = 0; bool isFirstIteration = true; - ElemType baseCriterion; + ElemType baseCriterion = 0; // increase the minibatch size by a factor of sqrt(2) in each step. 
const float minibatchSizeTuningFactor = sqrtf(2.0f); - size_t lastTriedtrialMinibatchSize = -1; + size_t lastTriedtrialMinibatchSize = 0; for (float trialMinibatchSizeFloat = (float) minMinibatchSize; trialMinibatchSizeFloat <= maxMinibatchSize; trialMinibatchSizeFloat *= minibatchSizeTuningFactor) From 54c1ac43491dfe2432703b675cec4a881c9de9e8 Mon Sep 17 00:00:00 2001 From: Yinggong ZHAO Date: Mon, 6 Jul 2015 18:30:33 -0700 Subject: [PATCH 3/6] Update softmax GPU NCE training --- MachineLearning/CNTK/SGD.h | 2 ++ Math/Math/CPUMatrix.cpp | 5 +++++ Math/Math/CPUMatrix.h | 2 ++ Math/Math/GPUMatrix.cu | 6 +++++- Math/Math/Matrix.cpp | 12 ++++++++++++ Math/Math/Matrix.h | 2 +- 6 files changed, 27 insertions(+), 2 deletions(-) diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h index b91352f452d8..589e73370900 100644 --- a/MachineLearning/CNTK/SGD.h +++ b/MachineLearning/CNTK/SGD.h @@ -1252,6 +1252,8 @@ class SGD: ComputationNetworkHelper trainSetDataReader->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize); startReadMBTime = Timer::MilliSecondElapsed(); + int a = 0; + if (a) while (trainSetDataReader->GetMinibatch(inputMatrices)) { #ifdef MPI_SUPPORT diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index 03e87cf85c81..f36fbe0e3dc6 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -3841,6 +3841,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { { return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c); } + template + void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& a, CPUMatrix& softmax) + { + + } template void CPUMatrix::AssignNCEUnnormalizedEval(const CPUMatrix& a, diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h index 3cbdc593eb25..874b8b93f177 100644 --- a/Math/Math/CPUMatrix.h +++ b/Math/Math/CPUMatrix.h @@ -217,6 +217,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void AssignNoiseContrastiveEstimation(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& tmp, CPUMatrix& c); + + void AssignSoftmaxSum(const CPUMatrix& a, CPUMatrix& softmax); void AssignNCEUnnormalizedEval(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& c); diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index 8cf6f536d767..a486eddfc278 100755 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -1929,7 +1929,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); } - + template + void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& softmax) + { + + } template void GPUMatrix::AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index c58862175694..fd42fae109d5 100755 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -3623,6 +3623,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } + + template + Matrix& Matrix::AssignSoftmaxSum(const Matrix& a, const Matrix& softmax) + { + this->Resize(1, 1); + if (this->GetDeviceId() < 0) + a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *this->m_CPUMatrix); + else + a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *this->m_GPUMatrix); + return *this; + } + template Matrix& Matrix::AssignNceUnnormalizedEval(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias) { diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 
9476263228f2..8875fa9160f4 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -147,7 +147,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix& AssignNoiseContrastiveEstimation(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias, Matrix& tmp); Matrix& AssignNCEDerivative(const Matrix& tmp, const Matrix& a, const Matrix& b, const Matrix& c, size_t inputIndex); - + Matrix& AssignSoftmaxSum(const Matrix& a, const Matrix& softmax); Matrix& AssignNceUnnormalizedEval(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& bias); Matrix Transpose(); // This method doesn't change state of Matrix. It should be a const function From 204b879dfe6fb178cdee45e516fe0ed7dea3ef95 Mon Sep 17 00:00:00 2001 From: Yinggong ZHAO Date: Mon, 6 Jul 2015 20:56:16 -0700 Subject: [PATCH 4/6] Finish GPU NCE training --- .../LMSequenceReader/SequenceReader.cpp | 2 +- MachineLearning/CNTK/SGD.h | 2 - MachineLearning/CNTK/TrainingCriterionNodes.h | 15 +++--- Math/Math/CPUMatrix.cpp | 12 ++++- Math/Math/GPUMatrix.cu | 18 ++++++- Math/Math/GPUMatrix.h | 2 +- Math/Math/GPUMatrixCUDAKernels.cu | 53 +++++++++++++++++++ Math/Math/Matrix.cpp | 6 ++- Math/Math/NoGPU.cpp | 4 ++ 9 files changed, 97 insertions(+), 17 deletions(-) diff --git a/DataReader/LMSequenceReader/SequenceReader.cpp b/DataReader/LMSequenceReader/SequenceReader.cpp index a4241cb7a5df..bd617a2ca152 100644 --- a/DataReader/LMSequenceReader/SequenceReader.cpp +++ b/DataReader/LMSequenceReader/SequenceReader.cpp @@ -2052,7 +2052,7 @@ void BatchSequenceReader::GetLabelOutput(std::mapTransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, false, false, false); } } diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h index 45d51169f89e..fc2eb0cf1c4c 100644 --- a/MachineLearning/CNTK/SGD.h +++ b/MachineLearning/CNTK/SGD.h @@ -1523,8 +1523,6 @@ class SGD : ComputationNetworkHelper trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize); startReadMBTime = Timer::MilliSecondElapsed(); - int a = 0; - if (a) while (trainSetDataReader->GetMinibatch(inputMatrices)) { #ifdef MPI_SUPPORT diff --git a/MachineLearning/CNTK/TrainingCriterionNodes.h b/MachineLearning/CNTK/TrainingCriterionNodes.h index 14013951ca1e..803fcd3ecadb 100644 --- a/MachineLearning/CNTK/TrainingCriterionNodes.h +++ b/MachineLearning/CNTK/TrainingCriterionNodes.h @@ -948,10 +948,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (Inputs(0)->FunctionValues().GetNumRows() == 1) { for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++) - if (Inputs(0)->FunctionValues()(0, i) > 0) - positive++; - else if (Inputs(0)->FunctionValues()(0, i) < 0) - negative++; + { + if (Inputs(0)->FunctionValues()(0, i) > 0) + positive++; + else if (Inputs(0)->FunctionValues()(0, i) < 0) + negative++; + } assert(positive * negative == 0); } if (m_evalMode == NCEEvalMode::Softmax || (Inputs(0)->FunctionValues().GetNumRows() == 1 && positive > 0)) @@ -960,10 +962,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false); m_logSoftmax += Inputs(3)->FunctionValues(); m_logSoftmax.InplaceLogSoftmax(false); - FunctionValues().Resize(1, 1); - FunctionValues().SetValue(0); - for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++) - FunctionValues()(0, 0) -= m_logSoftmax(i, (size_t)Inputs(0)->FunctionValues()(0, i)); + 
FunctionValues().AssignSoftmaxSum(Inputs(0)->FunctionValues(), m_logSoftmax); } else if (m_evalMode == NCEEvalMode::Unnormalized || (Inputs(0)->FunctionValues().GetNumRows() == 1 && negative > 0)) { diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index f36fbe0e3dc6..03379ebead61 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -3842,9 +3842,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c); } template - void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& a, CPUMatrix& softmax) + void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& softmax, CPUMatrix& c) { - + ElemType log_likelihood = 0.0; + size_t batch_size = this->GetNumCols(); +#pragma omp parallel for reduction(+:log_likelihood) + for (int instance_id = 0; instance_id < batch_size; instance_id++) + { + int sample = (int)(*this)(0, instance_id); + log_likelihood += softmax(instance_id, sample); + } + c(0, 0) = -log_likelihood; } template diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index a486eddfc278..5f36df1724e6 100755 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -1930,9 +1930,25 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (do_sync) CUDA_CALL(cudaEventDestroy(done)); } template - void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& softmax) + void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& c) { + UNCONST(ElemType, a, my_a); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + int p = 512; + int width = a.GetNumRows(); + while (p / 2 > width) p = p / 2; + _assignSoftmaxSum << <1, p >> >( + my_a.GetArray(), + width, + GetArray(), + c.GetArray() + ); + + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); } template void GPUMatrix::AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h index 65792fec02cf..f7314592a4b4 100755 --- a/Math/Math/GPUMatrix.h +++ b/Math/Math/GPUMatrix.h @@ -294,7 +294,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t sampleCount, GPUMatrix& tmp, GPUMatrix& c); void AssignNCEDerivative(GPUMatrix& tmp, const GPUMatrix& a, const GPUMatrix& b, size_t inputIndex, GPUMatrix& c); void AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c); - + void AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& softmax); void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const; void Print(const char* matrixName = NULL) const; //print whole matrix. can be expensive diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu index 6d37ede64092..c9a6c67bc0f0 100755 --- a/Math/Math/GPUMatrixCUDAKernels.cu +++ b/Math/Math/GPUMatrixCUDAKernels.cu @@ -2868,6 +2868,59 @@ __global__ void _computeNceOutput( } } + +template +__global__ void _assignSoftmaxSum( + const ElemType* softmax, + int sampleCount, + const ElemType* a, + ElemType* c) // run on 512 threads per block +{ + // val and col are in CSR format + // val is an array contains log_Pn(w). 
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index 6d37ede64092..c9a6c67bc0f0 100755
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -2868,6 +2868,59 @@ __global__ void _computeNceOutput(
     }
 }

+
+template<class ElemType>
+__global__ void _assignSoftmaxSum(
+    const ElemType* softmax,
+    int sampleCount,
+    const ElemType* a,
+    ElemType* c) // run on 512 threads per block
+{
+    // softmax is the log-softmax matrix in column-major format, one row per sample
+    // a is an array containing the class (word) index observed for each sample
+    // c is the single-element output that receives the summed objective
+
+    __shared__ ElemType partials[512];
+    partials[threadIdx.x] = 0;
+
+    int total = sampleCount;
+    int loadPerThread = (total + blockDim.x - 1) / blockDim.x;
+
+    // find out the items this thread is responsible for
+    int start = loadPerThread * threadIdx.x;
+    int end = min(total, loadPerThread * (threadIdx.x + 1));
+    for (int i = start; i < end; i++)
+    {
+        int wid = (int)a[i];
+        partials[threadIdx.x] += softmax[IDX2C(i, wid, sampleCount)];
+    }
+
+    __syncthreads();
+
+    // now sum up the objective function
+    int nTotalThreads = blockDim.x;
+
+    while (nTotalThreads > 1)
+    {
+        int halfPoint = (nTotalThreads >> 1);
+
+        if (threadIdx.x < halfPoint)
+            partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
+
+        __syncthreads();
+
+        nTotalThreads = (nTotalThreads >> 1);
+    }
+
+    if (threadIdx.x == 0)
+        c[0] = -partials[0];
+}
+
 template<class ElemType>
 __global__ void _assignNoiseContrastiveEstimation(
     const ElemType* val,
diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp
index fd42fae109d5..75cdd2f9b11a 100755
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@@ -747,9 +747,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #define NUM_MATRIXTYPE_CHANGED_WARN 20
             m_numTimesMatrixTypeChanged++;
+
             if (m_numTimesMatrixTypeChanged == NUM_MATRIXTYPE_CHANGED_WARN)
-                fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
-
+            {
+                fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
+            }
             if (GetDeviceId()<0) //CPU
             {
                 if (newMatrixType==MatrixType::SPARSE)
diff --git a/Math/Math/NoGPU.cpp b/Math/Math/NoGPU.cpp
index 47aba454eb7c..b9a3c09d1d0d 100644
--- a/Math/Math/NoGPU.cpp
+++ b/Math/Math/NoGPU.cpp
@@ -1067,6 +1067,10 @@ namespace Microsoft {

     }

+    template<class ElemType>
+    void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
+    {
+    }
     template<class ElemType>
     void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)

From 4d14e516df410d3027413165acae76d01a4f6817 Mon Sep 17 00:00:00 2001
From: Yinggong ZHAO
Date: Tue, 7 Jul 2015 00:20:24 -0700
Subject: [PATCH 5/6] Fix a bug in restoring from previous model in SGD

---
 MachineLearning/CNTK/SGD.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h
index fc2eb0cf1c4c..cac467763bd5 100644
--- a/MachineLearning/CNTK/SGD.h
+++ b/MachineLearning/CNTK/SGD.h
@@ -1862,7 +1862,7 @@ class SGD : ComputationNetworkHelper<ElemType>

         if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"))
         {
-            fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
+            //fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
             fstream >> minibatchSize;
             fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
         }
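The one-line change above removes a double read. Judging from the fix, TryGetMarker already consumes the BMinibatchSize marker when it finds one, so the following GetMarker call advanced the stream a second time and the subsequent minibatch-size read was misaligned when restoring from a previous model. A hedged sketch of the pattern, with the File API reduced to a token cursor (all names here are illustrative, not the CNTK API):

    #include <deque>
    #include <iostream>
    #include <string>

    // Minimal stand-in for the checkpoint stream; the assumption (consistent
    // with the fix above) is that TryGetMarker consumes the marker on success.
    struct TokenStream
    {
        std::deque<std::string> tokens;
        bool TryGetMarker(const std::string& m)
        {
            if (!tokens.empty() && tokens.front() == m) { tokens.pop_front(); return true; }
            return false;
        }
        std::string Next() { std::string t = tokens.front(); tokens.pop_front(); return t; }
    };

    int main()
    {
        TokenStream f{ { "BMinibatchSize", "256", "EMinibatchSize" } };
        if (f.TryGetMarker("BMinibatchSize")) // marker already consumed here
        {
            // A second marker read at this point would not find the marker and
            // would misalign the stream; the next token is already the value.
            std::cout << "minibatchSize = " << f.Next() << "\n"; // prints 256
        }
        return 0;
    }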
From 496dfcb4c2665e562787b402982d022778f2fc15 Mon Sep 17 00:00:00 2001
From: Chris Basoglu
Date: Tue, 7 Jul 2015 10:52:06 -0700
Subject: [PATCH 6/6] Cleanup after Adaptive Minibatch size change

---
 MachineLearning/CNTK/SGD.h | 122 +++++++++++++++++++++----------------
 1 file changed, 71 insertions(+), 51 deletions(-)

diff --git a/MachineLearning/CNTK/SGD.h b/MachineLearning/CNTK/SGD.h
index cac467763bd5..bc734c4e907a 100644
--- a/MachineLearning/CNTK/SGD.h
+++ b/MachineLearning/CNTK/SGD.h
@@ -669,8 +669,12 @@ class SGD : ComputationNetworkHelper<ElemType>
         bool learnRateInitialized = false;
         if (startEpoch > 0)
         {
-            learnRateInitialized = LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen,
-                                                      learnRatePerSample, smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
+            learnRateInitialized = LoadCheckPointInfo(startEpoch - 1,
+                                                      /*out*/ totalSamplesSeen,
+                                                      /*out*/ learnRatePerSample,
+                                                      smoothedGradients,
+                                                      /*out*/ prevCriterion,
+                                                      /*out*/ m_prevChosenMinibatchSize);
             if (learnRateInitialized)
             {
                 prevLearnRates[startEpoch % m_numPrevLearnRates] = learnRatePerSample;
@@ -908,9 +912,12 @@ class SGD : ComputationNetworkHelper<ElemType>
                     net.LoadPersistableParametersFromFile(GetModelNameForEpoch(i - 1), m_validateAfterModelReloading);
                     net.ResetEvalTimeStamp();

-                    LoadCheckPointInfo(i - 1, totalSamplesSeen,
-                                       learnRatePerSample,
-                                       smoothedGradients, prevCriterion, m_prevChosenMinibatchSize);
+                    LoadCheckPointInfo(i - 1,
+                                       /*out*/ totalSamplesSeen,
+                                       /*out*/ learnRatePerSample,
+                                       smoothedGradients,
+                                       /*out*/ prevCriterion,
+                                       /*out*/ m_prevChosenMinibatchSize);
                     fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
                     loadedPrevModel = true;
                 }
@@ -989,12 +996,12 @@ class SGD : ComputationNetworkHelper<ElemType>
             }
         }

-        //since we linked feature nodes. we need to remove it from the deletion
+        // since we linked feature nodes, we need to remove them from the deletion
        if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr)
        {
            for (size_t i = 0; i < refFeatureNodes.size(); i++)
            {
-                //note we need to handle deletion carefully
+                // note we need to handle deletion carefully
                refNet.ChangeNode(refFeatureNodes[i]->NodeName(), refFeatureNodes[i]);
            }
        }
@@ -1054,7 +1061,7 @@ class SGD : ComputationNetworkHelper<ElemType>
             }
         }

-        //mark done
+        // mark done
         for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
         {
             PreComputedNode<ElemType>* node = static_cast<PreComputedNode<ElemType>*>(*nodeIter);
@@ -1111,8 +1118,12 @@ class SGD : ComputationNetworkHelper<ElemType>
         ElemType learnRate = learnRatePerSample;
         size_t dummyMinibatchSize = 0;
-        LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, learnRate,
-                           smoothedGradients, prevCriterion, dummyMinibatchSize);
+        LoadCheckPointInfo(baseModelEpoch,
+                           /*out*/ totalSamplesSeen,
+                           /*out*/ learnRate,
+                           smoothedGradients,
+                           /*out*/ prevCriterion,
+                           /*out*/ dummyMinibatchSize);

         // if model is not changed this is what we will get
         TrainOneMiniEpochAndReloadModel(net, refNet, refNode, epochNumber,
@@ -1120,8 +1131,9 @@ class SGD : ComputationNetworkHelper<ElemType>
                                         FeatureNodes, labelNodes,
                                         criterionNodes, evaluationNodes,
                                         inputMatrices, learnableNodes,
-                                        smoothedGradients, baseCriterion,
-                                        epochEvalErrors, totalSamplesSeen, "BaseAdaptiveLearnRateSearch:");
+                                        smoothedGradients, /*out*/ baseCriterion,
+                                        /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen,
+                                        "BaseAdaptiveLearnRateSearch:");

         if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
         {
@@ -1149,8 +1161,8 @@ class SGD : ComputationNetworkHelper<ElemType>
                                                 labelNodes, criterionNodes,
                                                 evaluationNodes, inputMatrices,
                                                 learnableNodes, smoothedGradients,
-                                                epochCriterion, epochEvalErrors,
-                                                totalSamplesSeen, "AdaptiveLearnRateSearch:");
+                                                /*out*/ epochCriterion, /*out*/ epochEvalErrors,
+                                                /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:");
             } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate));
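The do/while above keeps retrying a mini-epoch at progressively smaller learning rates until the criterion improves on the baseline (or the rate hits the floor). A condensed sketch of that control flow; TryMiniEpoch and the shrink factor are illustrative stand-ins, not the CNTK calls:

    #include <cmath>

    // Condensed sketch of the adaptive learning-rate search loop above.
    double SearchLearnRate(double learnRatePerSample, double baseCriterion,
                           double minLearnRate, double (*TryMiniEpoch)(double))
    {
        double epochCriterion;
        do
        {
            learnRatePerSample *= 0.618; // shrink and retry (assumed factor)
            epochCriterion = TryMiniEpoch(learnRatePerSample);
        } while (std::isnan(epochCriterion) ||
                 (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate));
        return learnRatePerSample;
    }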
"AdaptiveLearnRateSearch:"); + /*out*/ epochCriterion, /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, "AdaptiveLearnRateSearch:"); } while (std::isnan(epochCriterion) || (epochCriterion > baseCriterion && learnRatePerSample > minLearnRate)); @@ -1170,8 +1182,9 @@ class SGD : ComputationNetworkHelper FeatureNodes, labelNodes, criterionNodes, evaluationNodes, inputMatrices, learnableNodes, - smoothedGradients, leftCriterion, - epochEvalErrors, totalSamplesSeen, "DetailBaseAdaptiveLearnRateSearch:"); + smoothedGradients, /*out*/ leftCriterion, + /*out*/ epochEvalErrors, /*out*/ totalSamplesSeen, + "DetailBaseAdaptiveLearnRateSearch:"); while (rightLearnRatePerSample > leftLearnRatePerSample * 1.2f) { @@ -1189,9 +1202,10 @@ class SGD : ComputationNetworkHelper inputMatrices, learnableNodes, smoothedGradients, - rightCriterion, - epochEvalErrors, - totalSamplesSeen, "DetailRightAdaptiveLearnRateSearch:"); + /*out*/ rightCriterion, + /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + "DetailRightAdaptiveLearnRateSearch:"); } else { @@ -1207,9 +1221,10 @@ class SGD : ComputationNetworkHelper inputMatrices, learnableNodes, smoothedGradients, - leftCriterion, - epochEvalErrors, - totalSamplesSeen, "DetailLeftAdaptiveLearnRateSearch:"); + /*out*/ leftCriterion, + /*out*/ epochEvalErrors, + /*out*/ totalSamplesSeen, + "DetailLeftAdaptiveLearnRateSearch:"); } } @@ -1235,8 +1250,9 @@ class SGD : ComputationNetworkHelper const std::vector& evaluationNodes, std::map*>& inputMatrices, const std::list& learnableNodes, - /*out*/ std::list>& smoothedGradients, - /*out*/ ElemType& epochCriterion, std::vector& epochEvalErrors, + std::list>& smoothedGradients, + /*out*/ ElemType& epochCriterion, + /*out*/ std::vector& epochEvalErrors, /*out*/ size_t& totalSamplesSeen, std::string prefixMsg = "") { @@ -1273,8 +1289,12 @@ class SGD : ComputationNetworkHelper ElemType dummyLearnRate; ElemType dummtPrevCriterion; size_t dummyMinibatchSize = 0; - LoadCheckPointInfo(baseModelEpoch, totalSamplesSeen, dummyLearnRate, - smoothedGradients, dummtPrevCriterion, dummyMinibatchSize); + LoadCheckPointInfo(baseModelEpoch, + /*out*/ totalSamplesSeen, + /*out*/ dummyLearnRate, + smoothedGradients, + /*out*/ dummtPrevCriterion, + /*out*/ dummyMinibatchSize); } size_t AdaptiveMinibatchSizing(ComputationNetwork& net, @@ -1501,14 +1521,14 @@ class SGD : ComputationNetworkHelper unsigned long long startReadMBTime = 0, startComputeMBTime = 0; unsigned long long endReadMBTime = 0, endComputeMBTime = 0; - //initialize statistics + // initialize statistics size_t totalEpochSamples = 0; int numMBsRun = 0; size_t numEvalNodes = epochEvalErrors.size(); - //assume only one training criterion node for each epoch + // assume only one training criterion node for each epoch Matrix localEpochCriterion(1, 1, net.GetDeviceID()); Matrix localEpochEvalErrors(1, numEvalNodes, net.GetDeviceID()); @@ -1550,7 +1570,7 @@ class SGD : ComputationNetworkHelper throw std::logic_error("cannot pass gradient checker"); } #endif - //TODO: currently only support one node regularization + // TODO: currently only support one node regularization if (m_needRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode != nullptr) { refNet.SetActualMiniBatchSize(actualMBSize); @@ -1562,15 +1582,15 @@ class SGD : ComputationNetworkHelper labelNodes[0]->FunctionValues()); } - //only compute gradient when learning rate is large enough + // only compute gradient when learning rate is large enough if (learnRatePerSample > m_minLearnRate * 0.01) { - 
@@ -1656,8 +1676,8 @@ class SGD : ComputationNetworkHelper<ElemType>
                 break;
             }

-            /// call DataEnd function
-            /// DataEnd does reader specific process if sentence ending is reached
+            // call DataEnd function
+            // DataEnd does reader-specific processing if a sentence ending is reached
             trainSetDataReader->DataEnd(endDataSentence);

             profiler.NextSample();
@@ -1715,7 +1735,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         // L2 regularizer
         if (L2RegWeight > 0)
         {
-            //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample
+            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
             Matrix<ElemType>::ScaleAndAdd(L2RegWeight * actualMBSize, functionValues, gradientValues);
         }
@@ -1723,7 +1743,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         {
             ElemType momentum = sgd->MomentumPerMB();

-            //we use simple linear (instead of log linear) scaling here
+            // we use simple linear (instead of log linear) scaling here
             if (actualMBSize < expectedMBSize && momentum > 0.0000001f)
             {
                 momentum = (ElemType) exp(log(momentum) / expectedMBSize * actualMBSize);
             }
@@ -1755,7 +1775,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         // L1 regularizer with proximal gradient descent method
         if (L1RegWeight > 0)
         {
-            //*actualMBSize so that it's invariant to minibatch size since learning rate is per sample
+            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
             functionValues.InplaceSoftThreshold(learnRatePerSample * L1RegWeight * actualMBSize);
         }
@@ -1794,7 +1814,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         }
         else
         {
-            //norm2 normalized
+            // norm2 normalized
             ElemType gradientNorm = gradient.FrobeniusNorm();
             if (gradientNorm > maxGradientPerMB)
             {
@@ -1829,7 +1849,7 @@ class SGD : ComputationNetworkHelper<ElemType>

         for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
         {
-            const Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
+            const Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
             fstream << smoothedGradient;
         }

         fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");

         fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ECKP");
     }

-    bool LoadCheckPointInfo(const size_t epochNumber, size_t& totalSamplesSeen,
-                            ElemType& learnRatePerSample,
+    bool LoadCheckPointInfo(const size_t epochNumber,
+                            /*out*/ size_t& totalSamplesSeen,
+                            /*out*/ ElemType& learnRatePerSample,
                             std::list<Matrix<ElemType>>& smoothedGradients,
-                            ElemType& prevCriterion,
-                            size_t& minibatchSize)
+                            /*out*/ ElemType& prevCriterion,
+                            /*out*/ size_t& minibatchSize)
     {
         wstring checkPointFileName = GetCheckPointFileNameForEpoch(int(epochNumber));
         if (!fexists(checkPointFileName.c_str()))
@@ -1862,20 +1883,19 @@ class SGD : ComputationNetworkHelper<ElemType>

         if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize"))
         {
-            //fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMinibatchSize");
             fstream >> minibatchSize;
             fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMinibatchSize");
         }
         else
         {
-            minibatchSize =m_mbSize[epochNumber];
+            minibatchSize = m_mbSize[epochNumber];
         }

         fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");

         for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
         {
-            Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
+            Matrix<ElemType>& smoothedGradient = *smoothedGradientIter;
             fstream >> smoothedGradient;
         }
         fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
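A common thread in the L2, momentum, and L1 hunks above: hyperparameters are specified per sample (or per expected minibatch) and rescaled by the actual minibatch size, so the effective update stays invariant when adaptive minibatch sizing changes the batch. Note that exp(log(m) / expectedMBSize * actualMBSize) is exactly m^(actualMBSize / expectedMBSize), i.e. log-linear scaling despite what the comment says. A small sketch under those assumptions (struct and function names are illustrative):

    #include <cmath>
    #include <cstddef>

    // Sketch of the minibatch-size rescalings above (names are illustrative).
    struct ScaledHyperparams { double momentum, l2Term, l1Threshold; };

    ScaledHyperparams RescaleForMinibatch(double momentumPerMB, double l2RegWeight,
                                          double l1RegWeight, double learnRatePerSample,
                                          std::size_t actualMBSize, std::size_t expectedMBSize)
    {
        ScaledHyperparams h;
        // momentum^(actual/expected): same value as exp(log(m)/expected*actual)
        h.momentum = std::exp(std::log(momentumPerMB) / expectedMBSize * actualMBSize);
        h.l2Term = l2RegWeight * actualMBSize;                            // added to the gradient
        h.l1Threshold = learnRatePerSample * l1RegWeight * actualMBSize;  // soft-threshold radius
        return h;
    }
    // e.g. momentumPerMB = 0.9, expected = 256, actual = 128 -> momentum ~ 0.9487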
@@ -1905,12 +1925,12 @@ class SGD : ComputationNetworkHelper<ElemType>

     }

-    //return -1 if nothing exists
+    // return -1 if nothing exists
     int DetermineStartEpoch(const bool makeMode)
     {
         if (!makeMode)
         {
-            //always start from scratch
+            // always start from scratch
             return -1;
         }
@@ -2048,7 +2068,7 @@ class SGD : ComputationNetworkHelper<ElemType>

                 node->UpdateEvalTimeStamp();

-                //use only the first criterion. Is there any possibility to use more?
+                // use only the first criterion. Is there any possibility to use more?
                 net.ComputeGradient(criterionNodes[npos]);

                 if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE)
@@ -2092,7 +2112,7 @@ class SGD : ComputationNetworkHelper<ElemType>
                 node->UpdateEvalTimeStamp();
                 net.Evaluate(criterionNodes[npos]);
-                //criterionNode should be a scalar
+                // criterionNode should be a scalar
                 ElemType mbEvalCriNeg = criterionNodes[npos]->FunctionValues().Get00Element();

                 // back to its original parameter value
@@ -2168,7 +2188,7 @@ class SGD : ComputationNetworkHelper<ElemType>
         ElemType m_reduceLearnRateIfImproveLessThan;
         bool m_continueReduce;

-        //determine after how many epochs the learning rate should be auto adjusted.
+        // determine after how many epochs the learning rate should be auto adjusted.
         size_t m_learnRateAdjustInterval;

         ElemType m_increaseLearnRateIfImproveMoreThan;
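The two hunks above in the gradient checker follow the usual central-difference scheme: compute the analytic gradient once via backprop, then re-evaluate the criterion with one parameter perturbed in each direction (mbEvalCriNeg is the negative-side evaluation) and compare. A compact sketch of such a check; the epsilon and tolerance values are illustrative, not taken from the source:

    #include <algorithm>
    #include <cmath>
    #include <functional>

    // Compact sketch of a central-difference gradient check in the spirit of
    // the hunks above, for a single weight.
    bool CheckGradient(const std::function<double(double)>& criterion, // criterion as a function of one weight
                       double w,            // current weight value
                       double analyticGrad, // gradient from backprop
                       double eps = 1e-4,
                       double relTol = 1e-3)
    {
        double mbEvalCriPos = criterion(w + eps); // criterion at w + eps
        double mbEvalCriNeg = criterion(w - eps); // criterion at w - eps
        double numericGrad = (mbEvalCriPos - mbEvalCriNeg) / (2 * eps);
        double denom = std::max(std::fabs(analyticGrad), std::fabs(numericGrad));
        return denom == 0 || std::fabs(analyticGrad - numericGrad) / denom <= relTol;
    }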