diff --git a/Source/ActionsLib/ActionsLib.vcxproj.filters b/Source/ActionsLib/ActionsLib.vcxproj.filters index 38cc00b65315..030da8855a28 100644 --- a/Source/ActionsLib/ActionsLib.vcxproj.filters +++ b/Source/ActionsLib/ActionsLib.vcxproj.filters @@ -13,15 +13,15 @@ Actions - - Actions - Actions Actions + + Actions + diff --git a/Source/CNTK/CNTK.vcxproj b/Source/CNTK/CNTK.vcxproj index cf2ca3d3b392..df1ad3f7ce93 100644 --- a/Source/CNTK/CNTK.vcxproj +++ b/Source/CNTK/CNTK.vcxproj @@ -174,8 +174,6 @@ - - diff --git a/Source/CNTK/CNTK.vcxproj.filters b/Source/CNTK/CNTK.vcxproj.filters index e00025801910..e794c5e10eb1 100644 --- a/Source/CNTK/CNTK.vcxproj.filters +++ b/Source/CNTK/CNTK.vcxproj.filters @@ -175,9 +175,6 @@ from ComputationNetworkLib\Network - - from ComputationNetworkLib\Network - from ComputationNetworkLib\Nodes @@ -187,9 +184,6 @@ from ComputationNetworkLib\Nodes - - from ComputationNetworkLib\Network - from ComputationNetworkLib\Nodes diff --git a/Source/SGDLib/IComputationNetBuilder.h b/Source/SGDLib/IComputationNetBuilder.h deleted file mode 100644 index 2dabf55e3a3f..000000000000 --- a/Source/SGDLib/IComputationNetBuilder.h +++ /dev/null @@ -1,30 +0,0 @@ -#if 1 // only needed for some unused code in MultiNetworksSGD.h -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -#pragma once - -#include "ComputationNetwork.h" -#include - -namespace Microsoft { namespace MSR { namespace CNTK { - -// This interface provides only one method: BuildNetworkFromDescription(). -// There are two variants currently: -// - SimpleNetworkBuilder: standard networks built from a few parameters -// - NDLNetworkBuilder: networks built using the old CNTK NDL -// The use of this interface is very local (eventually it will be local to DoTrain() only), so there will no longer be a need to even have this interface. -// Models created through BrainScript (or Python) do not go through this interface. - -template -/*interface*/ struct IComputationNetBuilder -{ - virtual ComputationNetworkPtr BuildNetworkFromDescription(ComputationNetwork* = nullptr) = 0; - virtual ~IComputationNetBuilder(){}; -}; -} -} -} -#endif diff --git a/Source/SGDLib/MultiNetworksEvaluator.h b/Source/SGDLib/MultiNetworksEvaluator.h deleted file mode 100644 index 2b9604d295c5..000000000000 --- a/Source/SGDLib/MultiNetworksEvaluator.h +++ /dev/null @@ -1,1085 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// -// MultiNetworksEvaluator/SGD -- This represents earlier efforts to use CNTK for sequence-to-sequence modeling. This is no longer the intended design. 
-//
-#pragma once
-
-#include "Basics.h"
-#include "Helpers.h" // for foreach_column() macro
-#include "fileutil.h"
-#include "DataReader.h"
-#include "DataWriter.h"
-#include "ComputationNetwork.h"
-#include "DataReaderHelpers.h"
-#include "SimpleEvaluator.h"
-#include "TrainingCriterionNodes.h" // TODO: we should move the functions that depend on these to the .cpp
-#include "CompositeComputationNodes.h"
-#include
-#include
-#include
-#include
-#include
-
-using namespace std;
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-template <class ElemType>
-struct NN_state
-{
-    map<wstring, Matrix<ElemType>> hidden_activity;
-};
-
-template <class ElemType>
-struct Token
-{
-    Token(const double score, const std::vector<size_t>& sequence, const NN_state<ElemType>& state)
-        : score(score), sequence(sequence), state(state)
-    {
-    }
-    bool operator<(const Token<ElemType>& t) const
-    {
-        return score < t.score;
-    }
-    double score;
-    vector<size_t> sequence;
-    NN_state<ElemType> state;
-};
-
-template <class ElemType>
-class MultiNetworksEvaluator : public SimpleEvaluator<ElemType>
-{
-    typedef SimpleEvaluator<ElemType> Base;
-    using Base::m_net;
-    using Base::m_numMBsToShowResult;
-    using Base::m_traceLevel;
-    using Base::DisplayEvalStatistics;
-    typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
-    typedef ClassBasedCrossEntropyWithSoftmaxNode<ElemType>* ClassBasedCrossEntropyWithSoftmaxNodePtr;
-
-public:
-    MultiNetworksEvaluator(ComputationNetworkPtr net, const size_t numMBsToShowResult = 100, const int traceLevel = 0)
-        : Base(net, numMBsToShowResult, traceLevel)
-    {
-    }
-
-    //returns error rate
-    // This was a special early implementation of RNNs by emulating them as a DNN.
-    // The code is very restricted to simple RNNs.
-    // The idea can be used for more complicated networks, but one needs to know which nodes are stateful or time-dependent so that the unrolling correctly represents the recurrent network.
-    // TODO: can probably be removed.
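// A minimal, CNTK-independent sketch of the unrolling idea described above
// (every name below is illustrative, not part of this code base): a simple
// recurrence h_t = tanh(W*x_t + R*h_{t-1}) is emulated as a feed-forward pass
// by replaying the same step once per time slice, with all time slices sharing
// the weights W and R. EvaluateUnroll() below relies on exactly this view when
// it forward-props the network one time slice at a time.
#include <cmath>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>; // dense row-major matrix

// one unrolled time step: hOut = tanh(W*x + R*hIn)
static Vec UnrolledStep(const Mat& W, const Mat& R, const Vec& x, const Vec& hIn)
{
    Vec hOut(W.size(), 0.0);
    for (size_t i = 0; i < W.size(); i++)
    {
        double acc = 0.0;
        for (size_t j = 0; j < x.size(); j++)
            acc += W[i][j] * x[j];
        for (size_t j = 0; j < hIn.size(); j++)
            acc += R[i][j] * hIn[j];
        hOut[i] = std::tanh(acc);
    }
    return hOut;
}

// the unrolled "DNN": T copies of the step above, one per input frame
static Vec ForwardUnrolled(const Mat& W, const Mat& R, const std::vector<Vec>& frames)
{
    Vec h(W.size(), 0.0); // default (zero) initial hidden activity
    for (const auto& x : frames)
        h = UnrolledStep(W, R, x, h);
    return h;
}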
- double EvaluateUnroll(IDataReader* dataReader, const size_t mbSize, double& evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize) - { - std::vector& featureNodes = m_net->FeatureNodes(); - std::vector& labelNodes = m_net->LabelNodes(); - std::vector& criterionNodes = m_net->FinalCriterionNodes(); - std::vector& evaluationNodes = m_net->EvaluationNodes(); - - if (criterionNodes.size() == 0) - RuntimeError("No CrossEntropyWithSoftmax node found\n"); - if (evaluationNodes.size() == 0) - RuntimeError("No Evaluation node found\n"); - - std::map*> inputMatrices; - for (size_t i = 0; i < featureNodes.size(); i++) - inputMatrices[featureNodes[i]->NodeName()] = &dynamic_pointer_cast>(featureNodes[i])->Value(); - for (size_t i = 0; i < labelNodes.size(); i++) - inputMatrices[labelNodes[i]->NodeName()] = &dynamic_pointer_cast>(labelNodes[i])->Value(); - inputMatrices[L"numberobs"] = new Matrix(1, 1, m_net->GetDeviceId()); - - dataReader->StartMinibatchLoop(mbSize, 0, testSize); - m_net->StartEvaluateMinibatchLoop(criterionNodes, evaluationNodes); - - double epochEvalError = 0; - double epochCrossEntropy = 0; - size_t totalEpochSamples = 0; - double prevEpochEvalError = 0; - double prevEpochCrossEntropy = 0; - size_t prevTotalEpochSamples = 0; - size_t prevStart = 1; - size_t numSamples = 0; - double crossEntropy = 0; - double evalError = 0; - - ofstream outputStream; - if (output) - { -#ifdef _MSC_VER - outputStream.open(output); -#else - outputStream.open(wtocharpath(output).c_str()); // GCC does not implement wide-char pathnames here -#endif - } - - size_t numMBsRun = 0; - size_t actualMBSize = 0; - while (dataReader->GetMinibatch(inputMatrices)) - { - // TODO: we should use GetMinibatchIntoNetwork(), but it seems tricky. What is this for? - size_t nbrSamples = (size_t) (*inputMatrices[L"numberobs"])(0, 0); - actualMBSize = nbrSamples; - - for (int npos = 0; npos < nbrSamples; npos++) - { - featureNodes[npos]->BumpEvalTimeStamp(); - labelNodes[npos]->BumpEvalTimeStamp(); - - m_net->ForwardProp(criterionNodes[npos]); //use only the first criterion. Is there any possibility to use more? 
- - m_net->ForwardProp(evaluationNodes[npos]); - - double mbCrossEntropy = (double) criterionNodes[npos]->Get00Element(); // criterionNode should be a scalar - epochCrossEntropy += mbCrossEntropy; - - double mbEvalError = (double) evaluationNodes[npos]->Get00Element(); //criterionNode should be a scalar - - epochEvalError += mbEvalError; - } - - totalEpochSamples += actualMBSize; - - if (outputStream.is_open()) - { - //TODO: add support to dump multiple outputs - ComputationNodePtr outputNode = dynamic_pointer_cast>(m_net->OutputNodes()[0]); - foreach_column (j, outputNode->Value()) - { - foreach_row (i, outputNode->Value()) - outputStream << outputNode->Value()(i, j) << " "; - outputStream << endl; - } - } - - numMBsRun++; - if (numMBsRun % m_numMBsToShowResult == 0) - { - numSamples = (totalEpochSamples - prevTotalEpochSamples); - crossEntropy = epochCrossEntropy - prevEpochCrossEntropy; - evalError = epochEvalError - prevEpochEvalError; - - fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", - prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples); - - prevTotalEpochSamples = totalEpochSamples; - prevEpochCrossEntropy = epochCrossEntropy; - prevEpochEvalError = epochEvalError; - prevStart = numMBsRun + 1; - } - } - - // show final grouping of output - numSamples = totalEpochSamples - prevTotalEpochSamples; - if (numSamples > 0) - { - crossEntropy = epochCrossEntropy - prevEpochCrossEntropy; - evalError = epochEvalError - prevEpochEvalError; - fprintf(stderr, "Minibatch[%lu-%lu]: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", - prevStart, numMBsRun, numSamples, evalError / numSamples, crossEntropy / numSamples); - } - - //final statistics - epochEvalError /= (double) totalEpochSamples; - epochCrossEntropy /= (double) totalEpochSamples; - fprintf(stderr, "Overall: Samples Evaluated = %lu EvalErr Per Sample = %.8g Loss Per Sample = %.8g\n", totalEpochSamples, epochEvalError, epochCrossEntropy); - if (outputStream.is_open()) - { - outputStream.close(); - } - evalSetCrossEntropy = epochCrossEntropy; - return epochEvalError; - } - -public: - /// for encoder-decoder RNN - list> m_lst_pair_encoder_decode_node_names; - list> m_lst_pair_encoder_decoder_nodes; - - void SetEncoderDecoderNodePairs(std::list>& lst_pair_encoder_decoder_nodes) - { - m_lst_pair_encoder_decoder_nodes.clear(); - for (typename std::list>::iterator iter = lst_pair_encoder_decoder_nodes.begin(); iter != lst_pair_encoder_decoder_nodes.end(); iter++) - m_lst_pair_encoder_decoder_nodes.push_back(*iter); - } - - /** - this evaluates encoder network and decoder framework - only beam search decoding is applied to the last network - */ - double EvaluateEncoderDecoderWithHiddenStates( - vector nets, - vector*> dataReaders, - const size_t mbSize, - const size_t testSize = requestDataSize) - { - size_t iNumNets = nets.size(); - - ComputationNetworkPtr decoderNet = nullptr; - IDataReader* decoderDataReader = dataReaders[iNumNets - 1]; - decoderNet = nets[iNumNets - 1]; - - const auto& decoderEvaluationNodes = decoderNet->EvaluationNodes(); - - double evalResults = 0; - - vector*>*> inputMatrices; - for (auto ptr = nets.begin(); ptr != nets.end(); ptr++) - { - const auto& featNodes = (*ptr)->FeatureNodes(); - const auto& lablPtr = (*ptr)->LabelNodes(); - map*>* pMap = new map*>(); - for (auto pf = featNodes.begin(); pf != featNodes.end(); pf++) - { - (*pMap)[(*pf)->NodeName()] = &dynamic_pointer_cast>(*pf)->Value(); 
- } - for (auto pl = lablPtr.begin(); pl != lablPtr.end(); pl++) - { - (*pMap)[(*pl)->NodeName()] = &(dynamic_pointer_cast>(*pl)->Value()); - } - inputMatrices.push_back(pMap); - } - - //evaluate through minibatches - size_t totalEpochSamples = 0; - size_t numMBsRun = 0; - size_t actualMBSize = 0; - size_t numSamplesLastMBs = 0; - size_t lastMBsRun = 0; //MBs run before this display - - double evalResultsLastMBs = (double) 0; - - for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++) - (*ptr)->StartMinibatchLoop(mbSize, 0, testSize); - // BUGBUG: Code below will fail because we now must call StartMinibatchLoop(), but I can't tell from below which nodes to call it for. - //for (auto & ptr : nets) - // ptr->StartMinibatchLoop(xxx); - - bool bContinueDecoding = true; - while (bContinueDecoding) - { - - /// load data - auto pmat = inputMatrices.begin(); - bool bNoMoreData = false; - for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++, pmat++) - { - if ((*ptr)->GetMinibatch(*(*pmat)) == false) - { - bNoMoreData = true; - break; - } - } - if (bNoMoreData) - break; - - for (auto ptr = nets.begin(); ptr != nets.end(); ptr++) - { - const auto& featNodes = (*ptr)->FeatureNodes(); - ComputationNetwork::BumpEvalTimeStamp(featNodes); - } - - auto preader = dataReaders.begin(); - for (auto ptr = nets.begin(); ptr != nets.end(); ptr++, preader++) - { - actualMBSize = (*ptr)->DetermineActualMBSizeFromFeatures(); - if (actualMBSize == 0) - LogicError("decoderTrainSetDataReader read data but encoderNet reports no data read"); - (*preader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr()); - (*ptr)->VerifyActualNumParallelSequences((*preader)->GetNumParallelSequences()); - - const auto& pairs = (*ptr)->PairNodes(); - for (auto ptr2 = pairs.begin(); ptr2 != pairs.end(); ptr2++) - (*ptr)->ForwardProp(*ptr2); - } - - decoderNet = nets[iNumNets - 1]; - /// not the sentence begining, because the initial hidden layer activity is from the encoder network - actualMBSize = decoderNet->DetermineActualMBSizeFromFeatures(); - if (actualMBSize == 0) - LogicError("decoderTrainSetDataReader read data but decoderNet reports no data read"); - decoderDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr()); - decoderNet->VerifyActualNumParallelSequences(decoderDataReader->GetNumParallelSequences()); - - size_t i = 0; - assert(decoderEvaluationNodes.size() == 1); - if (decoderEvaluationNodes.size() != 1) - { - LogicError("Decoder should have only one evaluation node"); - } - - for (auto ptr = decoderEvaluationNodes.begin(); ptr != decoderEvaluationNodes.end(); ptr++, i++) - { - decoderNet->ForwardProp(*ptr); - if ((*ptr)->GetSampleLayout().GetNumElements() != 1) - LogicError("EvaluateEncoderDecoderWithHiddenStates: decoder evaluation should return a scalar value"); - - evalResults += (double) (*ptr)->Get00Element(); - } - - totalEpochSamples += actualMBSize; - numMBsRun++; - - if (m_traceLevel > 0) - { - numSamplesLastMBs += actualMBSize; - - if (numMBsRun % m_numMBsToShowResult == 0) - { - DisplayEvalStatistics(lastMBsRun + 1, numMBsRun, numSamplesLastMBs, decoderEvaluationNodes, evalResults, evalResultsLastMBs); - - evalResultsLastMBs = evalResults; - - numSamplesLastMBs = 0; - lastMBsRun = numMBsRun; - } - } - - /// call DataEnd to check if end of sentence is reached - /// datareader will do its necessary/specific process for sentence ending - for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++) - { - (*ptr)->DataEnd(endDataSentence); - } - } - - // show last batch of results - if 
(m_traceLevel > 0 && numSamplesLastMBs > 0) - { - DisplayEvalStatistics(lastMBsRun + 1, numMBsRun, numSamplesLastMBs, decoderEvaluationNodes, evalResults, evalResultsLastMBs); - } - - //final statistics - evalResultsLastMBs = 0; - - fprintf(stderr, "Final Results: "); - DisplayEvalStatistics(1, numMBsRun, totalEpochSamples, decoderEvaluationNodes, evalResults, evalResultsLastMBs, true); - - evalResults /= totalEpochSamples; - - for (auto ptr = inputMatrices.begin(); ptr != inputMatrices.end(); ptr++) - { - delete *ptr; - } - - return evalResults; - } - - // TODO: This stuff must all be removed from SimpleEvaluator, as this is not simple at all!! - void InitTrainEncoderDecoderWithHiddenStates(const ConfigParameters& readerConfig) - { - ConfigArray arrEncoderNodeNames = readerConfig(L"encoderNodes", ""); - vector encoderNodeNames; - - m_lst_pair_encoder_decode_node_names.clear(); - ; - - if (arrEncoderNodeNames.size() > 0) - { - /// newer code that explicitly place multiple streams for inputs - foreach_index (i, arrEncoderNodeNames) // inputNames should map to node names - { - wstring nodeName = arrEncoderNodeNames[i]; - encoderNodeNames.push_back(nodeName); - } - } - - ConfigArray arrDecoderNodeNames = readerConfig(L"decoderNodes", ""); - vector decoderNodeNames; - if (arrDecoderNodeNames.size() > 0) - { - /// newer code that explicitly place multiple streams for inputs - foreach_index (i, arrDecoderNodeNames) // inputNames should map to node names - { - wstring nodeName = arrDecoderNodeNames[i]; - decoderNodeNames.push_back(nodeName); - } - } - - assert(encoderNodeNames.size() == decoderNodeNames.size()); - - for (size_t i = 0; i < encoderNodeNames.size(); i++) - { - m_lst_pair_encoder_decode_node_names.push_back(make_pair(encoderNodeNames[i], decoderNodeNames[i])); - } - } - - void EncodingEvaluateDecodingBeamSearch( - vector nets, - vector*> readers, - IDataWriter& dataWriter, - const vector& evalNodeNames, - const vector& writeNodeNames, - const size_t mbSize, const double beam, const size_t testSize) - { - size_t iNumNets = nets.size(); - if (iNumNets < 2) - { - LogicError("Has to have at least two networks"); - } - - ComputationNetworkPtr decoderNet = nets[iNumNets - 1]; - IDataReader* encoderDataReader = readers[iNumNets - 2]; - IDataReader* decoderDataReader = readers[iNumNets - 1]; - vector& decoderFeatureNodes = decoderNet->FeatureNodes(); - - //specify output nodes and files - std::vector outputNodes; - for (auto ptr = evalNodeNames.begin(); ptr != evalNodeNames.end(); ptr++) - outputNodes.push_back(decoderNet->GetNodeFromName(*ptr)); - - //specify nodes to write to file - std::vector writeNodes; - for (int i = 0; i < writeNodeNames.size(); i++) - writeNodes.push_back(m_net->GetNodeFromName(writeNodeNames[i])); - - //prepare features and labels - std::map*> inputMatrices; - std::map*> decoderInputMatrices; - for (auto ptr = nets.begin(); ptr != nets.end() - 1; ptr++) - { - const auto& featNodes = (*ptr)->FeatureNodes(); - for (auto ptr2 = featNodes.begin(); ptr2 != featNodes.end(); ptr2++) - inputMatrices[(*ptr2)->NodeName()] = &dynamic_pointer_cast>(*ptr2)->Value(); - - const auto& lablNodes = (*ptr)->LabelNodes(); - for (auto ptr2 = lablNodes.begin(); ptr2 != lablNodes.end(); ptr2++) - inputMatrices[(*ptr2)->NodeName()] = &dynamic_pointer_cast>(*ptr2)->Value(); - } - - /// for the last network - auto ptr = nets.end() - 1; - const auto& featNodes = (*ptr)->FeatureNodes(); - for (auto ptr2 = featNodes.begin(); ptr2 != featNodes.end(); ptr2++) - 
decoderInputMatrices[(*ptr2)->NodeName()] = &dynamic_pointer_cast>(*ptr2)->Value(); - - const auto& lablNodes = (*ptr)->LabelNodes(); - for (auto ptr2 = lablNodes.begin(); ptr2 != lablNodes.end(); ptr2++) - decoderInputMatrices[(*ptr2)->NodeName()] = &dynamic_pointer_cast>(*ptr2)->Value(); - - //evaluate through minibatches - size_t totalEpochSamples = 0; - size_t actualMBSize = 0; - - for (auto ptr = readers.begin(); ptr != readers.end(); ptr++) - { - (*ptr)->StartMinibatchLoop(mbSize, 0, testSize); - (*ptr)->SetNumParallelSequences(1); - } - - Matrix historyMat(m_net->GetDeviceId()); - - bool bDecoding = true; - while (bDecoding) - { - bool noMoreData = false; - /// only get minibatch on the encoder parts of networks - size_t k = 0; - for (auto ptr = readers.begin(); ptr != readers.end() - 1; ptr++, k++) - { - if ((*ptr)->GetMinibatch(inputMatrices) == false) - { - noMoreData = true; - break; - } - } - if (noMoreData) - break; - - for (auto ptr = nets.begin(); ptr != nets.end() - 1; ptr++) - { - /// only on the encoder part of the networks - const auto& featNodes = (*ptr)->FeatureNodes(); - ComputationNetwork::BumpEvalTimeStamp(featNodes); - } - - auto ptrreader = readers.begin(); - size_t mNutt = 0; - for (auto ptr = nets.begin(); ptr != nets.end() - 1; ptr++, ptrreader++) - { - /// evaluate on the encoder networks - actualMBSize = (*ptr)->DetermineActualMBSizeFromFeatures(); - - mNutt = (*ptrreader)->GetNumParallelSequences(); - (*ptrreader)->CopyMBLayoutTo((*ptr)->GetMBLayoutPtr()); - (*ptr)->VerifyActualNumParallelSequences(mNutt); - - const auto& pairs = (*ptr)->PairNodes(); - for (auto ptr2 = pairs.begin(); ptr2 != pairs.end(); ptr2++) - (*ptr)->ForwardProp(*ptr2); - } - -/// not the sentence begining, because the initial hidden layer activity is from the encoder network -//decoderNet->ResizeAllFeatureNodes(actualMBSize); // BUGBUG: Function was deleted, but this may be necessary. -#if 0 // What this ^^ used to be: - // only called from MultiNetworksEvaluator - // a helper function for some places that like to hack the features directly - // This is for a few places (FindBestPath stuff) that don't follow the normal pattern but instead called the old SetFeaturesMiniBatchSize() function with a value of their choosing. - // This is now changed in that they must actually resize the features, and then the system takes it from here. - // UNTESTED stopgap. Most likely places that are never used. - // This function does not actually allocate the matrices. I don't know whether that currently happens correctly. 
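// To make the comment above concrete: in this code base the minibatch width is
// the number of matrix columns (one column per frame), so "resizing all feature
// nodes" means giving every input matrix the desired column count before the
// next forward pass. A stand-alone sketch of that contract, with FeatureSketch
// as an illustrative stand-in for the real Matrix type:
#include <cstddef>
#include <vector>

struct FeatureSketch
{
    size_t rows = 0, cols = 0; // rows = feature dimension, cols = frames
    std::vector<double> data;
    void Resize(size_t r, size_t c) { rows = r; cols = c; data.assign(r * c, 0.0); }
};

// same row dimension, new column count -- the minibatch width in frames
static void ResizeAllFeatureSketches(std::vector<FeatureSketch*>& features, size_t cols)
{
    for (auto* m : features)
        m->Resize(m->rows, cols);
}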
- void ResizeAllFeatureNodes(size_t cols) - { - auto & featureNodes = FeatureNodes(); - for (auto & nodeIter : featureNodes) - nodeIter->SetNumCols(cols); - } - -#endif - //decoderNet->SetActualMiniBatchSizeFromFeatures(); - encoderDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr()); - decoderNet->VerifyActualNumParallelSequences(mNutt); - - vector best_path; - FindBestPathWithVariableLength(decoderNet, actualMBSize, decoderDataReader, dataWriter, outputNodes, writeNodes, decoderFeatureNodes, beam, &decoderInputMatrices, best_path); - - totalEpochSamples += actualMBSize; - - /// call DataEnd to check if end of sentence is reached - /// datareader will do its necessary/specific process for sentence ending - for (auto ptr = readers.begin(); ptr != readers.end(); ptr++) - (*ptr)->DataEnd(endDataSentence); - } - } - - template - static inline bool comparator(const pair& l, const pair& r) - { - return l.second > r.second; - } - - bool GetCandidatesAtOneTimeInstance(const Matrix& score, - const double& preScore, const double& threshold, - const double& best_score_so_far, - vector>& rCandidate) - { - Matrix ptrScore(CPUDEVICE); - ptrScore = score; - - ElemType* pPointer = ptrScore.BufferPointer(); - vector> tPairs; - for (int i = 0; i < ptrScore.GetNumElements(); i++) - { - tPairs.push_back(make_pair(i, pPointer[i])); - // assert(pPointer[i] <= 1.0); /// work on the posterior probabilty, so every score should be smaller than 1.0 - } - - std::sort(tPairs.begin(), tPairs.end(), comparator); - - bool bAboveThreshold = false; - for (typename vector>::iterator itr = tPairs.begin(); itr != tPairs.end(); itr++) - { - if (itr->second < 0.0) - LogicError("This means to use probability so the value should be non-negative"); - - double dScore = (itr->second > (double) EPS_IN_LOG) ? log(itr->second) : (double) LOG_OF_EPS_IN_LOG; - - dScore += preScore; - if (dScore >= threshold && dScore >= best_score_so_far) - { - rCandidate.push_back(make_pair(itr->first, dScore)); - bAboveThreshold = true; - } - else - { - break; - } - } - - return bAboveThreshold; - } - - // retrieve activity at time atTime. - // notice that the function values returned is single column - void PreComputeActivityAtTime(size_t atTime) - { - for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++) - { - ComputationNodeBasePtr node = *nodeIter; - node->ForwardProp(FrameRange(node->GetMBLayout(), atTime)); - if (node->GetSampleMatrixNumCols() != node->GetNumParallelSequences()) - RuntimeError("preComputeActivityAtTime: the function values has to be a single column matrix "); - } - } - - // (only called by FindBestPath...()) - void ResetPreCompute() - { - //mark false - for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++) - { - auto node = static_pointer_cast>(*nodeIter); - node->MarkComputed(false); - } - } - - //return true if precomputation is executed. - bool EvaluateBatchModeNodes(ComputationNetwork& net, - const std::vector& featureNodes) - { - batchComputeNodes = net.GetNodesRequiringBatchMode(); - - if (batchComputeNodes.size() == 0) - { - return false; - } - - ComputationNetwork::BumpEvalTimeStamp(featureNodes); - - net.StartEvaluateMinibatchLoop(batchComputeNodes); // TODO: Is this correct? There is no StartMinibatchLoop() for a reader. 
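// The batch-mode pattern this function applies, reduced to a stand-alone
// sketch (illustrative types only; MarkComputed()/GetNodesRequiringBatchMode()
// in the surrounding code are the real hooks): nodes that need the whole
// utterance are evaluated once up front and flagged, so the per-frame passes
// that follow reuse the cached result instead of recomputing it.
#include <vector>

struct BatchNodeSketch
{
    bool computed = false;
    void ForwardWholeUtterance() { computed = true; } // expensive full-sequence pass
};

static bool PrecomputeOnce(std::vector<BatchNodeSketch*>& nodes)
{
    bool ranAny = false;
    for (auto* n : nodes)
        if (!n->computed) // ResetPreCompute() clears this flag between utterances
        {
            n->ForwardWholeUtterance();
            ranAny = true;
        }
    return ranAny;
}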
- - //net.SetActualMiniBatchSizeFromFeatures(); - for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++) - net.ForwardProp(*nodeIter); - - //mark done - for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++) - { - auto node = static_pointer_cast>(*nodeIter); - node->MarkComputed(true); - } - - return true; - } - - void WriteNbest(const size_t nidx, const vector& best_path, - const std::vector& outputNodes, IDataWriter& dataWriter) - { - assert(outputNodes.size() == 1); - std::map outputMatrices; - size_t bSize = best_path.size(); - for (int i = 0; i < outputNodes.size(); i++) - { -#if 0 // This call no longer exists. This must be updated to make it functional again. - outputNodes[i]->SetNumCols(bSize); -#endif - dynamic_pointer_cast>(outputNodes[i])->UpdateFunctionValuesSize(); - dynamic_pointer_cast>(outputNodes[i])->Value().SetValue(0); - for (int k = 0; k < bSize; k++) - dynamic_pointer_cast>(outputNodes[i])->Value().SetValue(best_path[k], k, 1.0); - outputMatrices[outputNodes[i]->NodeName()] = (void*) (&dynamic_pointer_cast>(outputNodes[i])->Value()); - // TODO: void* --really? - } - - dataWriter.SaveData(nidx, outputMatrices, bSize, bSize, 0); - } - - void BeamSearch(IDataReader* dataReader, IDataWriter& dataWriter, const vector& outputNodeNames, const vector& writeNodeNames, const size_t mbSize, const double beam, const size_t testSize) - { - clock_t startReadMBTime = 0, endComputeMBTime = 0; - - //specify output nodes and files - std::vector outputNodes; - for (int i = 0; i < outputNodeNames.size(); i++) - outputNodes.push_back(m_net->GetNodeFromName(outputNodeNames[i])); - - //specify nodes to write to file - std::vector writeNodes; - for (int i = 0; i < writeNodeNames.size(); i++) - writeNodes.push_back(m_net->GetNodeFromName(writeNodeNames[i])); - - //prepare features and labels - /*const*/ auto& featureNodes = m_net->FeatureNodes(); - const auto& labelNodes = m_net->LabelNodes(); - - std::map*> inputMatrices; - for (size_t i = 0; i < featureNodes.size(); i++) - inputMatrices[featureNodes[i]->NodeName()] = &dynamic_pointer_cast>(featureNodes[i])->Value(); - for (size_t i = 0; i < labelNodes.size(); i++) - inputMatrices[labelNodes[i]->NodeName()] = &dynamic_pointer_cast>(labelNodes[i])->Value(); - - //evaluate through minibatches - size_t totalEpochSamples = 0; - size_t actualMBSize = 0; - - dataReader->StartMinibatchLoop(mbSize, 0, testSize); - dataReader->SetNumParallelSequences(1); - - startReadMBTime = clock(); - size_t numMBsRun = 0; - double ComputeTimeInMBs = 0; - while (DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, nullptr, false, false, inputMatrices, actualMBSize)) - { - // note: GetMinibatchIntoNetwork() will also fetch the MBLayout although we don't need ithere. This should not hurt. 
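// Self-contained sketch of the beam search that FindBestPath() below performs
// for each utterance (illustrative code, not CNTK APIs; the real tokens also
// carry the RNN hidden state via NN_state so that each hypothesis resumes from
// its own network history): every surviving token is expanded with the
// log-probability of each candidate label, and hypotheses falling more than
// 'beam' below the best score are pruned.
#include <algorithm>
#include <limits>
#include <vector>

struct BeamTokenSketch
{
    double score;                 // accumulated log-probability
    std::vector<size_t> sequence; // label indices decoded so far
};

static std::vector<BeamTokenSketch> BeamStep(const std::vector<BeamTokenSketch>& frontier,
                                             const std::vector<double>& logProbs, // one per label
                                             double beam)
{
    std::vector<BeamTokenSketch> expanded;
    double best = -std::numeric_limits<double>::infinity();
    for (const auto& tok : frontier)
        for (size_t label = 0; label < logProbs.size(); label++)
        {
            BeamTokenSketch t{tok.score + logProbs[label], tok.sequence};
            t.sequence.push_back(label);
            best = std::max(best, t.score);
            expanded.push_back(std::move(t));
        }
    std::vector<BeamTokenSketch> kept; // beam pruning: keep scores within 'beam' of the best
    for (auto& t : expanded)
        if (t.score >= best - beam)
            kept.push_back(std::move(t));
    return kept;
}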
ComputationNetwork::BumpEvalTimeStamp(featureNodes);
-            //actualMBSize = m_net->SetActualMiniBatchSizeFromFeatures();
-
-            vector<size_t> best_path;
-
-            FindBestPath(m_net, dataReader,
-                         dataWriter, outputNodes,
-                         writeNodes, featureNodes,
-                         beam, &inputMatrices, best_path);
-
-            totalEpochSamples += actualMBSize;
-
-            /// call DataEnd to check if end of sentence is reached
-            /// datareader will do its necessary/specific process for sentence ending
-            dataReader->DataEnd(endDataSentence);
-
-            endComputeMBTime = clock();
-            numMBsRun++;
-
-            if (m_traceLevel > 0)
-            {
-                double MBComputeTime = (double) (endComputeMBTime - startReadMBTime) / CLOCKS_PER_SEC;
-
-                ComputeTimeInMBs += MBComputeTime;
-
-                fprintf(stderr, "Sentences Seen = %zd; Samples seen = %zd; Total Compute Time = %.8g ; Time Per Sample=%.8g\n", numMBsRun, totalEpochSamples, ComputeTimeInMBs, ComputeTimeInMBs / totalEpochSamples);
-            }
-
-            startReadMBTime = clock();
-        }
-
-        fprintf(stderr, "done decoding\n");
-    }
-
-    void FindBestPath(ComputationNetworkPtr evalnet,
-                      IDataReader<ElemType>* dataReader, IDataWriter<ElemType>& dataWriter,
-                      const std::vector<ComputationNodeBasePtr>& evalNodes,
-                      const std::vector<ComputationNodeBasePtr>& outputNodes,
-                      /*const*/ std::vector<ComputationNodeBasePtr>& featureNodes,
-                      const double beam,
-                      std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
-                      vector<size_t>& best_path)
-    {
-        assert(evalNodes.size() == 1);
-
-        NN_state<ElemType> state;
-        NN_state<ElemType> null_state;
-
-        priority_queue<Token<ElemType>> n_bests; /// save n-bests
-
-        /**
-        loop over all the candidates for the featureDelayTarget,
-        evaluate their scores, save their histories
-        */
-        priority_queue<Token<ElemType>> from_queue, to_queue;
-        vector<double> evalResults;
-
-        /// use reader to initialize evalnet's sentence start information to let it know that this
-        /// is the beginning of sentence
-        size_t mbSize = evalnet->DetermineActualMBSizeFromFeatures();
-        dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr());
-        evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences());
-
-        size_t maxMbSize = 2 * mbSize;
-
-        clock_t start, now;
-        start = clock();
-
-        /// for the case of not using encoding, no previous state is available, except for the default hidden layer activities
-        /// no need to get that history and later to set the history as there are default hidden layer activities
-
-        from_queue.push(Token<ElemType>(0., vector<size_t>(), state)); /// the first element in the priority queue saves the initial NN state
-
-        dataReader->InitProposals(inputMatrices);
-        size_t itdx = 0;
-        size_t maxSize = min(maxMbSize, mbSize);
-
-        ResetPreCompute();
-        EvaluateBatchModeNodes(*evalnet, featureNodes);
-
-/// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this
-/// is the beginning of sentence
-#if 0 // This call no longer exists. This must be updated to make it functional again.
-        for (auto ptr = featureNodes.begin(); ptr != featureNodes.end(); ptr++)
-            (*ptr)->SetNumCols(1);
-#endif
-        // TODO: ^^ this is the same as ResizeAllFeatureNodes() if featureNodes == evalnet.FeatureNodes(). Is it?
-        //evalnet->SetActualMiniBatchSizeFromFeatures();
-
-        dataReader->CopyMBLayoutTo(evalnet->GetMBLayoutPtr()); // TODO: should this be one column only?
-        /// need to set the sentence beginning segmentation info
-        // BUGBUG: I commented this out because these flags no longer exist. This code is no longer functional.
-        //evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::SequenceStart));
-
-        for (itdx = 0; itdx < maxSize; itdx++)
-        {
-            double best_score = -numeric_limits<double>::infinity();
-            vector<size_t> best_output_label;
-
-            if (itdx > 0)
-            {
-                /// state needs to be carried over from the past time instance
-                // BUGBUG: I commented this out because these flags no longer exist. This code is no longer functional. [fseide]
-                //evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::None));
-            }
-
-            PreComputeActivityAtTime(itdx);
-
-            while (!from_queue.empty())
-            {
-                const Token<ElemType> from_token = from_queue.top();
-                vector<size_t> history = from_token.sequence;
-
-                /// update feature nodes once, as the observation is the same for all proposals in labels
-                ComputationNetwork::BumpEvalTimeStamp(featureNodes);
-
-                /// history is updated in the GetProposalObs function
-                dataReader->GetProposalObs(inputMatrices, itdx, history);
-
-                /// get the nn state history and set nn state to the history
-                map<wstring, Matrix<ElemType>> hidden_history = from_token.state.hidden_activity;
-                evalnet->SetHistory(hidden_history);
-
-                for (int i = 0; i < evalNodes.size(); i++)
-                {
-                    evalnet->ForwardProp(evalNodes[i]);
-                    vector<pair<int, double>> retPair;
-                    if (GetCandidatesAtOneTimeInstance(dynamic_pointer_cast<ComputationNode<ElemType>>(evalNodes[i])->Value(), from_token.score, best_score - beam, -numeric_limits<double>::infinity(), retPair) == false)
-                        continue;
-
-                    evalnet->GetHistory(state.hidden_activity, true);
-                    for (typename vector<pair<int, double>>::iterator itr = retPair.begin(); itr != retPair.end(); itr++)
-                    {
-                        vector<size_t> history = from_token.sequence;
-                        history.push_back(itr->first);
-                        Token<ElemType> to_token(itr->second, history, state); /// save updated nn state and history
-
-                        to_queue.push(to_token);
-
-                        if (itr->second > best_score) /// update best score
-                        {
-                            best_score = itr->second;
-                            best_output_label = history;
-                        }
-                    }
-
-                    history = from_token.sequence; /// back to the from token's history
-                }
-
-                from_queue.pop();
-            }
-
-            if (to_queue.size() == 0)
-                break;
-
-            // beam pruning
-            const double threshold = best_score - beam;
-            while (!to_queue.empty())
-            {
-                if (to_queue.top().score >= threshold)
-                    from_queue.push(to_queue.top());
-                to_queue.pop();
-            }
-        }
-
-        // write back best path
-        size_t ibest = 0;
-        while (from_queue.size() > 0)
-        {
-            Token<ElemType> seq(from_queue.top().score, from_queue.top().sequence, from_queue.top().state);
-
-            best_path.clear();
-
-            assert(best_path.empty());
-            best_path = seq.sequence;
-            if (ibest == 0)
-                WriteNbest(ibest, best_path, outputNodes, dataWriter);
-
-#ifdef DBG_BEAM_SEARCH
-            WriteNbest(ibest, best_path, outputNodes, dataWriter);
-            cout << " score = " << from_queue.top().score << endl;
-#endif
-
-            from_queue.pop();
-
-            ibest++;
-        }
-
-        now = clock();
-        fprintf(stderr, "%.1f words per second\n", mbSize / ((double) (now - start) / 1000.0));
-    }
-
-    /**
-    beam search decoder
-    */
-    double FindBestPathWithVariableLength(ComputationNetworkPtr evalnet,
-                                          size_t inputLength,
-                                          IDataReader<ElemType>* dataReader,
-                                          IDataWriter<ElemType>& dataWriter,
-                                          std::vector<ComputationNodeBasePtr>& evalNodes,
-                                          std::vector<ComputationNodeBasePtr>& outputNodes,
-                                          std::vector<ComputationNodeBasePtr>& featureNodes,
-                                          const double beam,
-                                          std::map<std::wstring, Matrix<ElemType>*>* inputMatrices,
-                                          vector<size_t>& best_path)
-    {
-        assert(evalNodes.size() == 1);
-
-        NN_state<ElemType> state;
-        NN_state<ElemType> null_state;
-
-        std::priority_queue<Token<ElemType>> n_bests; /// save n-bests
-
-        /**
-        loop over all the candidates for the featureDelayTarget,
-        evaluate their scores, save their histories
-        */
-        std::priority_queue<Token<ElemType>> from_queue, to_queue;
-        std::priority_queue<Token<ElemType>> result_queue;
-        vector<double> evalResults;
-
-        size_t mbSize = inputLength;
-        /// use reader to
initialize evalnet's sentence start information to let it know that this - /// is the beginning of sentence - //evalnet->ResizeAllFeatureNodes(mbSize); // BUGBUG: Function was deleted, but this may be necessary. - //evalnet->SetActualMiniBatchSizeFromFeatures(); - // TODO: not setting MBLayout? - evalnet->VerifyActualNumParallelSequences(dataReader->GetNumParallelSequences()); - // TODO: This is UNTESTED; if it fails, change ^^ this back to SetActual...() - - size_t maxMbSize = 3 * mbSize; -#ifdef _DEBUG - maxMbSize = 2; -#endif - - clock_t start, now; - start = clock(); - - from_queue.push(Token(0., vector(), state)); /// the first element in the priority queue saves the initial NN state - - /// the end of sentence symbol in reader - int outputEOS = dataReader->GetSentenceEndIdFromOutputLabel(); - if (outputEOS < 0) - LogicError("Cannot find end of sentence symbol. Check "); - - dataReader->InitProposals(inputMatrices); - - size_t itdx = 0; - - ResetPreCompute(); - EvaluateBatchModeNodes(*evalnet, featureNodes); - - /// need to set the minibatch size to 1, and initialize evalnet's sentence start information to let it know that this - /// is the begining of sentence - // BUGBUG: This is almost certainly wrong; slice != MB size - //evalnet->SetActualMiniBatchSize(dataReader->GetNumParallelSequences()); - //evalnet->ResizeAllFeatureNodes(1); // BUGBUG: Function was deleted, but this may be necessary. - //evalnet->SetActualMiniBatchSizeFromFeatures(); - - double best_score = -numeric_limits::infinity(); - double best_score_so_far = -numeric_limits::infinity(); - - // BUGBUG: I commented this out because these flags no longer exist. This code is no longer functional. - //evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::SequenceStart)); // BUGBUG: huh? How can the entire batch be start frames? - - for (itdx = 0; itdx < maxMbSize; itdx++) - { - double best_score = -numeric_limits::infinity(); - vector best_output_label; - - if (itdx > 0) - { - /// state need to be carried over from past time instance - // BUGBUG: I commented this out because these flags no longer exist. This code is no longer functional. - //evalnet->GetMBLayoutPtr()->GetM().SetValue(((int) MinibatchPackingFlags::None)); - } - - PreComputeActivityAtTime(itdx); - - while (!from_queue.empty()) - { - const Token from_token = from_queue.top(); - vector history = from_token.sequence; - - /// update feature nodes once, as the observation is the same for all propsoals in labels - ComputationNetwork::BumpEvalTimeStamp(featureNodes); - - /// history is updated in the getproposalobs function - dataReader->GetProposalObs(inputMatrices, itdx, history); - - /// get the nn state history and set nn state to the history - map> hidden_history = from_token.state.hidden_activity; - evalnet->SetHistory(hidden_history); - - for (int i = 0; i < evalNodes.size(); i++) - { - evalnet->ForwardProp(evalNodes[i]); - vector> retPair; - if (GetCandidatesAtOneTimeInstance(dynamic_pointer_cast>(evalNodes[i])->Value(), - from_token.score, best_score - beam, -numeric_limits::infinity(), retPair) == false) // ==false??? !(.)? 
- continue; - - evalnet->GetHistory(state.hidden_activity, true); - for (typename vector>::iterator itr = retPair.begin(); itr != retPair.end(); itr++) - { - vector history = from_token.sequence; - history.push_back(itr->first); - - if (itr->first != outputEOS) - { - Token to_token(itr->second, history, state); /// save updated nn state and history - - to_queue.push(to_token); - - if (itr->second > best_score) /// update best score - { - best_score = itr->second; - best_output_label = history; - } - } - else - { - /// sentence ending reached - Token to_token(itr->second, history, state); - result_queue.push(to_token); - } - } - - history = from_token.sequence; /// back to the from token's history - } - - from_queue.pop(); - } - - if (to_queue.size() == 0) - break; - - // beam pruning - const double threshold = best_score - beam; - while (!to_queue.empty()) - { - if (to_queue.top().score >= threshold) - from_queue.push(to_queue.top()); - to_queue.pop(); - } - - best_score_so_far = best_score; - } - - // write back best path - size_t ibest = 0; - while (result_queue.size() > 0) - { - best_path.clear(); - //vector *p = &result_queue.top().sequence; - assert(best_path.empty()); - best_path.swap(const_cast&>(result_queue.top().sequence)); - { - double score = result_queue.top().score; - best_score = score; - fprintf(stderr, "best[%zd] score = %.4e\t", ibest, score); - if (best_path.size() > 0) - WriteNbest(ibest, best_path, outputNodes, dataWriter); - } - - ibest++; - - result_queue.pop(); - break; /// only output the top one - } - - now = clock(); - fprintf(stderr, "%.1f words per second\n", mbSize / ((double) (now - start) / 1000.0)); - - return best_score; - } - -protected: - /// used for backward directional nodes - std::list batchComputeNodes; -}; -} } } diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h deleted file mode 100644 index 166f757573db..000000000000 --- a/Source/SGDLib/MultiNetworksSGD.h +++ /dev/null @@ -1,1269 +0,0 @@ -// -// Copyright (c) Microsoft. All rights reserved. -// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. -// -// MultiNetworksEvaluator/SGD -- This represents earlier efforts to use CNTK for sequence-to-sequence modeling. This is no longer the intended design. 
-// -#pragma once - -// TODO: this cannot be instantiated as a whole (compile error), although some function is called from CNTK.cpp--should be fixed - -#include "Basics.h" -#include "ComputationNetwork.h" -#include "IComputationNetBuilder.h" -#include "SimpleEvaluator.h" -#include "MultiNetworksEvaluator.h" -#include "DataReader.h" -#include -#include -#include -#include "fileutil.h" -#include "Config.h" -#include -#include -#include "TimerUtility.h" -#include "SGD.h" - -using namespace std; - -namespace Microsoft { namespace MSR { namespace CNTK { - -extern std::wstring GetEncoderModelNameForEpoch(int epoch, bool b = false); -extern std::wstring GetDecoderModelNameForEpoch(int epoch, bool b = false); - -template -class MultiNetworksSGD : SGD -{ - ElemType m_default_activity; - - using SGDBase = SGD; - -public: - // TODO: use a macro similar to class ComputeNode - using SGDBase::m_modelPath; - using SGDBase::m_maxEpochs; - using SGDBase::m_doUnitTest; - using SGDBase::m_learnRateAdjustInterval; - using SGDBase::m_mbSize; - using SGDBase::m_momentumParam; - using SGDBase::m_learningRatesParam; - using SGDBase::GetLearningRatePerSample; - using SGDBase::GetMomentumPerSample; - using SGDBase::m_dropoutRates; - using SGDBase::m_autoLearnRateSearchType; - using SGDBase::m_minLearnRate; - using SGDBase::m_loadBestModel; - //using SGDBase::m_validateAfterModelReloading; - using SGDBase::m_continueReduce; - using SGDBase::m_reduceLearnRateIfImproveLessThan; - using SGDBase::m_epochSize; - using SGDBase::m_learnRateDecreaseFactor; - using SGDBase::m_increaseLearnRateIfImproveMoreThan; - using SGDBase::m_learnRateIncreaseFactor; - using SGDBase::m_keepCheckPointFiles; - using SGDBase::m_doGradientCheck; - using SGDBase::m_L2RegWeight; - using SGDBase::m_L1RegWeight; - using SGDBase::m_needAveMultiplier; - using SGDBase::m_useNesterovMomentum; - using SGDBase::m_traceLevel; - using SGDBase::m_numMBsToShowResult; - using SGDBase::m_gradientCheckSigDigit; - using SGDBase::m_prevChosenMinibatchSize; - using SGDBase::UpdateWeights; - using SGDBase::GetCheckPointFileNameForEpoch; - using SGDBase::GetTrainCriterionNodes; - using SGDBase::GetEvalCriterionNodes; - - typedef shared_ptr> ComputationNodePtr; - - /// for encoder and decoder nodes pairing - wstring m_decoderModelPath; - wstring m_backwardDecoderModelPath; - wstring m_encoderModelPath; - - list> m_lst_pair_encoder_decode_node_names; - list> m_lst_pair_encoder_decoder_nodes; - -public: - MultiNetworksSGD(const ConfigParameters& configSGD) - : SGDBase(configSGD) - { - } - - ~MultiNetworksSGD() - { - } - - void InitTrainEncoderDecoderWithHiddenStates(const ConfigParameters& readerConfig) - { - - m_decoderModelPath = m_modelPath + L".decoder"; - m_backwardDecoderModelPath = m_modelPath + L".backward.decoder"; - m_encoderModelPath = m_modelPath + L".encoder"; - - ConfigArray arrEncoderNodeNames = readerConfig(L"encoderNodes", ""); - vector encoderNodeNames; - m_lst_pair_encoder_decode_node_names.clear(); - - if (arrEncoderNodeNames.size() > 0) - { - /// newer code that explicitly place multiple streams for inputs - foreach_index (i, arrEncoderNodeNames) // inputNames should map to node names - { - wstring nodeName = arrEncoderNodeNames[i]; - encoderNodeNames.push_back(nodeName); - } - } - - ConfigArray arrDecoderNodeNames = readerConfig(L"decoderNodes", ""); - vector decoderNodeNames; - if (arrDecoderNodeNames.size() > 0) - { - /// newer code that explicitly place multiple streams for inputs - foreach_index (i, arrDecoderNodeNames) // inputNames 
should map to node names - { - wstring nodeName = arrDecoderNodeNames[i]; - decoderNodeNames.push_back(nodeName); - } - } - - assert(encoderNodeNames.size() == decoderNodeNames.size()); - - for (size_t i = 0; i < encoderNodeNames.size(); i++) - { - m_lst_pair_encoder_decode_node_names.push_back(make_pair(encoderNodeNames[i], decoderNodeNames[i])); - fprintf(stderr, "paired %ls <-> %ls\n", encoderNodeNames[i].c_str(), decoderNodeNames[i].c_str()); - } - } - - void EncoderDecoder(vector*> netBuilder, DEVICEID_TYPE deviceId, - vector*> trainSetDataReader, - vector*> validationSetDataReader, - const bool makeMode) - { - if (validationSetDataReader.size() == 0) - InvalidArgument("validation set reader should not be null."); - - int startEpoch = DetermineEncoderDecoderStartEpoch(makeMode); - if (startEpoch == m_maxEpochs) - { - fprintf(stderr, "No further training is necessary.\n"); - return; - } - - size_t iNumNetworks = netBuilder.size(); - vector nets; - ComputationNetworkPtr eachNet; - for (size_t k = 0; k < iNumNetworks; k++) - { - wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1, false, msra::strfun::wstrprintf(L".%d", k)); - fprintf(stderr, "network model FileName=%ls\n", modelFileName.c_str()); - if (startEpoch >= 0) - fprintf(stderr, "Starting from checkpoint. Load Network From File %ls.\n", modelFileName.c_str()); - if (k == 0) - { - eachNet = - startEpoch < 0 ? netBuilder[k]->BuildNetworkFromDescription() : ComputationNetwork::CreateFromFile(deviceId, modelFileName, FileOptions::fileOptionsBinary, true /*bAllowNoCriterionNode*/); - nets.push_back(eachNet); - } - else - { - eachNet = - startEpoch < 0 ? netBuilder[k]->BuildNetworkFromDescription(nets[k - 1].get()) : ComputationNetwork::CreateFromFile(deviceId, modelFileName, FileOptions::fileOptionsBinary, false /*bAllowNoCriterionNode*/, nets[k - 1].get()); - nets.push_back(eachNet); - } - } - - startEpoch = max(startEpoch, 0); - - if (m_doUnitTest) - { - if (nets[iNumNetworks - 1]->UnitTest() == false) - LogicError("unit test on decoder network not passed"); - - return; - } - - fprintf(stderr, "start training ...\n"); - TrainEncoderDecoderModel(startEpoch, nets, trainSetDataReader, validationSetDataReader); - } - - //return -1 if nothing exists - int DetermineEncoderDecoderStartEpoch(const bool makeMode) - { - if (!makeMode) - return -1; //always start from scratch - - int firstEpoch = -1; - - wstring curEpochFile = GetModelNameForEpoch(int(m_maxEpochs) - 1, false, L".0"); - for (int e = int(m_maxEpochs) - 1; e >= -1; e--) - { - const wstring prevEpochFile = GetModelNameForEpoch(e - 1, false, L".0"); - - if (msra::files::fuptodate(curEpochFile, prevEpochFile, false)) - { - firstEpoch = size_t(e) + 1; - break; - } - else - curEpochFile = prevEpochFile; - } - - return firstEpoch; - } - - wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false, wstring ext = L"") - { - int epoch1Base = epoch + 1; - if (epoch1Base == m_maxEpochs || bLastModel) - return m_modelPath + ext; - else - return msra::strfun::wstrprintf(L"%s%s.%d", m_modelPath.c_str(), ext.c_str(), (int) epoch1Base); - } - - void TrainEncoderDecoderModel(int startEpoch, ComputationNetworkPtr encoderNet, - ComputationNetworkPtr decoderNet, - IDataReader* encoderTrainSetDataReader, - IDataReader* decoderTrainSetDataReader, - IDataReader* encoderValidationSetDataReader, - IDataReader* decoderValidationSetDataReader) - { - std::vector& encoderFeatureNodes = encoderNet->FeatureNodes(); - std::vector& encoderEvaluationNodes = encoderNet->OutputNodes(); - - 
std::vector& decoderFeatureNodes = decoderNet->FeatureNodes(); - std::vector& decoderLabelNodes = decoderNet->LabelNodes(); - std::vector& decoderCriterionNodes = GetTrainCriterionNodes(*decoderNet); - std::vector& decoderEvaluationNodes = GetEvalCriterionNodes(*decoderNet); - - std::map *> encoderInputMatrices, decoderInputMatrices; - for (size_t i = 0; i < encoderFeatureNodes.size(); i++) - encoderInputMatrices[encoderFeatureNodes[i]->NodeName()] = &dynamic_pointer_cast>(encoderFeatureNodes[i])->Value(); - for (size_t i = 0; i < decoderFeatureNodes.size(); i++) - decoderInputMatrices[decoderFeatureNodes[i]->NodeName()] = &dynamic_pointer_cast>(decoderFeatureNodes[i])->Value(); - for (size_t i = 0; i < decoderLabelNodes.size(); i++) - decoderInputMatrices[decoderLabelNodes[i]->NodeName()] = &dynamic_pointer_cast>(decoderLabelNodes[i])->Value(); - - //initializing weights and gradient holder - const std::list& encoderLearnableNodes = encoderNet->LearnableParameterNodes(encoderEvaluationNodes[0]); //only one criterion so far TODO: support multiple ones? - const std::list& decoderLearnableNodes = decoderNet->LearnableParameterNodes(decoderCriterionNodes[0]); - std::list learnableNodes; - for (auto nodeIter = encoderLearnableNodes.begin(); nodeIter != encoderLearnableNodes.end(); nodeIter++) - learnableNodes.push_back(*nodeIter); - for (auto nodeIter = decoderLearnableNodes.begin(); nodeIter != decoderLearnableNodes.end(); nodeIter++) - learnableNodes.push_back(*nodeIter); - - std::list> smoothedGradients; -#if 0 // No longer functional due to lack of GetNumCols(). - for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) - { - ComputationNodePtr node = dynamic_pointer_cast>(*nodeIter); - smoothedGradients.push_back(Matrix(node->GetNumRows(), node->GetNumCols(), node->Value().GetDeviceId())); - } -#endif - - vector epochCriterion; - double avgCriterion, prevCriterion; - for (size_t i = 0; i < 2; i++) - epochCriterion.push_back(std::numeric_limits::infinity()); - avgCriterion = prevCriterion = std::numeric_limits::infinity(); - - size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval; - - std::vector epochEvalErrors(decoderEvaluationNodes.size(), std::numeric_limits::infinity()); - - std::vector evalNodeNames; - for (size_t i = 0; i < decoderEvaluationNodes.size(); i++) - evalNodeNames.push_back(decoderEvaluationNodes[i]->NodeName()); - - size_t totalSamplesSeen = 0; - double learnRatePerSample = 0.5f / m_mbSize[startEpoch]; - - int m_numPrevLearnRates = 5; //used to control the upper learnining rate in LR search to reduce computation - vector prevLearnRates; - prevLearnRates.resize(m_numPrevLearnRates); - for (int i = 0; i < m_numPrevLearnRates; i++) - prevLearnRates[i] = std::numeric_limits::infinity(); - - //precompute mean and invStdDev nodes and save initial model - if ( /// to-do doesn't support pre-compute such as MVN here - /// PreCompute(net, encoderTrainSetDataReader, encoderFeatureNodes, encoderlabelNodes, encoderInputMatrices) || - startEpoch == 0) - { - encoderNet->Save(GetEncoderModelNameForEpoch(int(startEpoch) - 1)); - decoderNet->Save(GetDecoderModelNameForEpoch(int(startEpoch) - 1)); - } - - bool learnRateInitialized = false; - if (startEpoch > 0) - learnRateInitialized = this->LoadCheckPointInfo(startEpoch - 1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, m_prevChosenMinibatchSize); - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized 
&& m_learningRatesParam.size() <= startEpoch) - InvalidArgument("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch."); - - ULONG dropOutSeed = 1; - double prevDropoutRate = 0; - - bool learnRateReduced = false; - - for (int i = int(startEpoch); i < int(m_maxEpochs); i++) - { - auto t_start_epoch = clock(); - - //set dropout rate - ComputationNetwork::SetDropoutRate(*encoderNet, encoderEvaluationNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); - ComputationNetwork::SetDropoutRate(*decoderNet, decoderCriterionNodes[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed); - - //learning rate adjustment - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesParam.size() > 0 && m_learningRatesParam.size() > i)) - { - learnRatePerSample = GetLearningRatePerSample(i /*BUGBUG workaround:*/, encoderTrainSetDataReader->GetNumParallelSequences()); - } - else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch) - { - NOT_IMPLEMENTED; - } - - learnRateInitialized = true; - - if (learnRatePerSample < m_minLearnRate) - { - fprintf(stderr, "Learn Rate Per Sample for Epoch[%lu] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate); - break; - } - - TrainOneEpochEncoderDecoderWithHiddenStates(encoderNet, decoderNet, i, - m_epochSize, encoderTrainSetDataReader, - decoderTrainSetDataReader, learnRatePerSample, - encoderFeatureNodes, encoderEvaluationNodes, &encoderInputMatrices, - decoderFeatureNodes, decoderLabelNodes, decoderCriterionNodes, decoderEvaluationNodes, - &decoderInputMatrices, learnableNodes, smoothedGradients, - epochCriterion, epochEvalErrors, totalSamplesSeen); - - auto t_end_epoch = clock(); - double epochTime = 1.0 * (t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC); - - // fprintf(stderr, "Finished Epoch[%lu]: [Training Set] Train Loss Per Sample = %.8g ", i + 1, epochCriterion); - fprintf(stderr, "Finished Epoch[%lu]: [Training Set] Decoder Train Loss Per Sample = %.8g ", i + 1, epochCriterion[0]); - if (epochEvalErrors.size() == 1) - { - fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime); - } - else - { - fprintf(stderr, "EvalErr Per Sample "); - for (size_t j = 0; j < epochEvalErrors.size(); j++) - fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]); - fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime); - fprintf(stderr, "Finished Epoch[%lu]: Criterion Node [%ls] Per Sample = %.8g\n", i + 1, decoderCriterionNodes[0]->NodeName().c_str(), epochCriterion[i + 1]); - for (size_t j = 0; j < epochEvalErrors.size(); j++) - fprintf(stderr, "Finished Epoch[%lu]: Evaluation Node [%ws] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]); - } - - if (decoderValidationSetDataReader != decoderTrainSetDataReader && decoderValidationSetDataReader != nullptr && - encoderValidationSetDataReader != encoderTrainSetDataReader && encoderValidationSetDataReader != nullptr) - { - SimpleEvaluator evalforvalidation(*decoderNet); - vector cvEncoderSetTrainAndEvalNodes; - cvEncoderSetTrainAndEvalNodes.push_back(encoderEvaluationNodes[0]->NodeName()); - - vector cvDecoderSetTrainAndEvalNodes; - cvDecoderSetTrainAndEvalNodes.push_back(decoderCriterionNodes[0]->NodeName()); - 
cvDecoderSetTrainAndEvalNodes.push_back(decoderEvaluationNodes[0]->NodeName()); - - vector vScore = evalforvalidation.EvaluateEncoderDecoderWithHiddenStates( - encoderNet, decoderNet, - encoderValidationSetDataReader, - decoderValidationSetDataReader, cvEncoderSetTrainAndEvalNodes, - cvDecoderSetTrainAndEvalNodes, m_mbSize[i]); - fprintf(stderr, "Finished Epoch[%lu]: [Validation Set] Train Loss Per Sample = %.8g EvalErr Per Sample = %.8g\n", - i + 1, vScore[0], vScore[1]); - - epochCriterion[0] = vScore[0]; //the first one is the decoder training criterion. - } - - bool loadedPrevModel = false; - size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1; - if (avgCriterion == std::numeric_limits::infinity()) - avgCriterion = epochCriterion[0]; - else - avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) * avgCriterion + epochCriterion[0]) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion); - - if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesParam.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) - { - if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits::infinity()) - { - if (m_loadBestModel) - { - encoderNet->RereadPersistableParameters(GetEncoderModelNameForEpoch(i - 1)); - decoderNet->RereadPersistableParameters(GetDecoderModelNameForEpoch(i - 1)); - - size_t dummyMinibatchSize = 0; - this->LoadCheckPointInfo(i - 1, - /*out*/ totalSamplesSeen, - /*out*/ learnRatePerSample, - smoothedGradients, - /*out*/ prevCriterion, - /*out*/ dummyMinibatchSize); - fprintf(stderr, "Loaded the previous model which has better training criterion.\n"); - loadedPrevModel = true; - } - } - - if (m_continueReduce) - { - if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - if (learnRateReduced == false) - { - learnRateReduced = true; - } - else - { - decoderNet->Save(GetDecoderModelNameForEpoch(i, true)); - encoderNet->Save(GetEncoderModelNameForEpoch(i, true)); - fprintf(stderr, "Finished training and saved final model\n\n"); - break; - } - } - if (learnRateReduced) - { - learnRatePerSample *= m_learnRateDecreaseFactor; - fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample); - } - } - else - { - if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - - learnRatePerSample *= m_learnRateDecreaseFactor; - fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample); - } - else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion && prevCriterion != std::numeric_limits::infinity()) - { - learnRatePerSample *= m_learnRateIncreaseFactor; - fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample); - } - } - } - - if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) //not loading previous values then set them - { - prevCriterion = avgCriterion; - epochsNotCountedInAvgCriterion = 0; - } - - //persist model and check-point info - decoderNet->Save(GetDecoderModelNameForEpoch(i)); - encoderNet->Save(GetEncoderModelNameForEpoch(i)); - - size_t dummyMinibatchSize = 0; - this->LoadCheckPointInfo(i, - /*out*/ totalSamplesSeen, - /*out*/ learnRatePerSample, - smoothedGradients, - /*out*/ prevCriterion, - /*out*/ dummyMinibatchSize); - - if 
(!m_keepCheckPointFiles) - _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); //delete previous checkpiont file to save space - - if (learnRatePerSample < 1e-12) - fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n", learnRatePerSample); - } - } - - void TrainEncoderDecoderModel(int startEpoch, vector nets, - vector*> trainDataReader, - vector*> validationDataReader) - { - size_t iNumNetworks = nets.size(); - vector*> featureNodes; - vector*> outputNodes; - vector*> pairNodes; - vector*> labelNodes; - vector*> criterionNodes; - vector*> evaluationNodes; - vector*>*> inputMatrices; - - for (size_t i = 0; i < iNumNetworks; i++) - { - auto* featPtr = &nets[i]->FeatureNodes(); - auto* lablPtr = &nets[i]->LabelNodes(); - featureNodes.push_back(featPtr); - outputNodes.push_back(&nets[i]->OutputNodes()); - pairNodes.push_back(&nets[i]->PairNodes()); - - labelNodes.push_back(lablPtr); - criterionNodes.push_back(&GetTrainCriterionNodes(nets[i])); - evaluationNodes.push_back(&GetEvalCriterionNodes(nets[i])); - - std::map*>* matrices; - matrices = new std::map*>(); - - for (size_t j = 0; j < featPtr->size(); j++) - { - (*matrices)[(*featPtr)[j]->NodeName()] = - &(dynamic_pointer_cast>((*featPtr)[j])->Value()); - } - - for (size_t j = 0; j < lablPtr->size(); j++) - { - (*matrices)[(*lablPtr)[j]->NodeName()] = - &(dynamic_pointer_cast>((*lablPtr)[j])->Value()); - } - inputMatrices.push_back(matrices); - } - - //initializing weights and gradient holder - std::list learnableNodes; - for (size_t i = 0; i < iNumNetworks; i++) - { - if (criterionNodes[i]->size() == 0) - { - for (auto ptr = evaluationNodes[i]->begin(); ptr != evaluationNodes[i]->end(); ptr++) - { - ComputationNodeBasePtr pptr = *ptr; - - const std::list& eachLearnableNodes = nets[i]->LearnableParameterNodes(pptr); //only one criterion so far TODO: support multiple ones? - for (auto nodeIter = eachLearnableNodes.begin(); nodeIter != eachLearnableNodes.end(); nodeIter++) - { - ComputationNodeBasePtr node = *nodeIter; - learnableNodes.push_back(node); - } - } - } - else - { - for (auto ptr = criterionNodes[i]->begin(); ptr != criterionNodes[i]->end(); ptr++) - { - ComputationNodeBasePtr pptr = *ptr; - - const std::list& eachLearnableNodes = nets[i]->LearnableParameterNodes(pptr); //only one criterion so far TODO: support multiple ones? - for (auto nodeIter = eachLearnableNodes.begin(); nodeIter != eachLearnableNodes.end(); nodeIter++) - { - ComputationNodeBasePtr node = *nodeIter; - learnableNodes.push_back(node); - } - } - } - - //for (auto ptr = pairNodes[i]->begin(); ptr != pairNodes[i]->end(); ptr++) - // nets[i]->BuildAndValidateSubNetwork(*ptr); - } - - std::list> smoothedGradients; -#if 0 // No longer functional due to lack of GetNumCols(). 
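// What the dead block below allocated, stated generically (sketch only; the
// exact update rule applied by UpdateWeights() may differ): one "smoothed
// gradient" buffer per learnable parameter, shaped like the parameter itself,
// which carries momentum state across minibatches, e.g.
//
//     smoothed = momentum * smoothed + (1 - momentum) * gradient
//
// applied elementwise. A stand-in illustration:
#include <cstddef>
#include <vector>

struct SmoothedGradientSketch
{
    std::vector<double> state; // one entry per parameter element, zero-initialized

    explicit SmoothedGradientSketch(size_t numElements) : state(numElements, 0.0) {}

    void Accumulate(const std::vector<double>& gradient, double momentum)
    {
        for (size_t i = 0; i < state.size(); i++)
            state[i] = momentum * state[i] + (1.0 - momentum) * gradient[i];
    }
};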
-#if 0 // No longer functional due to lack of GetNumCols().
-    for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-    {
-        ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
-        smoothedGradients.push_back(Matrix<ElemType>(node->GetNumRows(), node->GetNumCols(), node->Value().GetDeviceId()));
-    }
-#endif
-
-    double epochCriterion, avgCriterion, prevCriterion;
-    epochCriterion = std::numeric_limits<double>::infinity();
-    avgCriterion = prevCriterion = std::numeric_limits<double>::infinity();
-
-    size_t epochsNotCountedInAvgCriterion = startEpoch % m_learnRateAdjustInterval;
-
-    size_t iNumEvaluations = 0;
-    for (size_t i = 0; i < iNumNetworks; i++)
-    {
-        iNumEvaluations += evaluationNodes[i]->size();
-    }
-    std::vector<double> epochEvalErrors(iNumEvaluations, std::numeric_limits<double>::infinity());
-
-    std::vector<wstring> evalNodeNames;
-    for (size_t k = 0; k < iNumNetworks; k++)
-    {
-        for (auto ptr = evaluationNodes[k]->begin(); ptr != evaluationNodes[k]->end(); ptr++)
-            evalNodeNames.push_back((*ptr)->NodeName());
-    }
-
-    size_t totalSamplesSeen = 0;
-    double learnRatePerSample = 0.5f / m_mbSize[startEpoch];
-
-    int m_numPrevLearnRates = 5; // used to control the upper learning rate in LR search to reduce computation
-    vector<double> prevLearnRates;
-    prevLearnRates.resize(m_numPrevLearnRates);
-    for (int i = 0; i < m_numPrevLearnRates; i++)
-        prevLearnRates[i] = std::numeric_limits<double>::infinity();
-
-    // precompute mean and invStdDev nodes and save initial model
-    if ( /// TODO: does not support pre-computation such as MVN here
-        /// PreCompute(net, encoderTrainSetDataReader, encoderFeatureNodes, encoderlabelNodes, encoderInputMatrices) ||
-        startEpoch == 0)
-    {
-        for (size_t k = 0; k < iNumNetworks; k++)
-        {
-            wstring tmpstr = msra::strfun::wstrprintf(L".%d", k);
-            nets[k]->Save(GetModelNameForEpoch(int(startEpoch) - 1, false, tmpstr));
-        }
-    }
-
-    bool learnRateInitialized = false;
-    if (startEpoch > 0)
-    {
-        size_t dummyMinibatchSize = 0;
-        this->LoadCheckPointInfo(startEpoch - 1,
-                                 /*out*/ totalSamplesSeen,
-                                 /*out*/ learnRatePerSample,
-                                 smoothedGradients,
-                                 /*out*/ prevCriterion,
-                                 /*out*/ dummyMinibatchSize);
-    }
-
-    if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && !learnRateInitialized && m_learningRatesParam.size() <= startEpoch)
-        InvalidArgument("When using \"AdjustAfterEpoch\", there must either exist a checkpoint file, or an explicit learning rate must be specified in config for the starting epoch.");
-
-    ULONG dropOutSeed = 1;
-    double prevDropoutRate = 0;
-
-    bool learnRateReduced = false;
-
-    for (int i = int(startEpoch); i < int(m_maxEpochs); i++)
-    {
-        auto t_start_epoch = clock();
-
-        // set dropout rate
-        for (size_t k = 0; k < iNumNetworks; k++)
-        {
-            if (evaluationNodes[k]->size() > 0)
-                ComputationNetwork::SetDropoutRate(nets[k], (*evaluationNodes[k])[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
-            if (criterionNodes[k]->size() > 0)
-                ComputationNetwork::SetDropoutRate(nets[k], (*criterionNodes[k])[0], m_dropoutRates[i], prevDropoutRate, dropOutSeed);
-        }
-
-        // learning rate adjustment
-        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::None || (m_learningRatesParam.size() > 0 && m_learningRatesParam.size() > i))
-        {
-            learnRatePerSample = GetLearningRatePerSample(i /*BUGBUG workaround:*/, trainDataReader[0]->GetNumParallelSequences());
-        }
-        else if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::SearchBeforeEpoch)
-        {
-            NOT_IMPLEMENTED;
-        }
-
-        learnRateInitialized = true;
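Aside: the branch above prefers an explicitly configured per-epoch rate whenever the rates list covers the current epoch, and otherwise keeps the adaptively adjusted value carried over from earlier epochs. A hypothetical helper (not CNTK code) expressing the same selection policy:

#include <vector>

// If the configured schedule covers this epoch, it wins; otherwise the rate
// adapted by the increase/decrease logic from previous epochs is kept.
double selectLearnRatePerSample(const std::vector<double>& schedule, int epoch, double adaptedRate)
{
    if (!schedule.empty() && epoch < (int) schedule.size())
        return schedule[epoch];
    return adaptedRate;
}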
-
-        if (learnRatePerSample < m_minLearnRate)
-        {
-            fprintf(stderr, "Learn Rate Per Sample for Epoch[%d] = %.8g is less than minLearnRate %.8g. Training stops.\n", i + 1, learnRatePerSample, m_minLearnRate);
-            break;
-        }
-
-        TrainOneEpochEncoderDecoderWithHiddenStates(i, m_epochSize, nets,
-                                                    trainDataReader,
-                                                    featureNodes,
-                                                    pairNodes,
-                                                    evaluationNodes,
-                                                    inputMatrices,
-                                                    labelNodes,
-                                                    criterionNodes,
-                                                    learnableNodes,
-                                                    learnRatePerSample,
-                                                    smoothedGradients,
-                                                    epochCriterion, epochEvalErrors, totalSamplesSeen);
-
-        auto t_end_epoch = clock();
-        double epochTime = 1.0 * (t_end_epoch - t_start_epoch) / (CLOCKS_PER_SEC);
-
-        /**
-        this is hacky. Only allow evaluation on the first encoder->decoder pair
-        */
-        size_t decoderIdx = iNumNetworks - 1;
-        IDataReader<ElemType>* decoderValidationSetDataReader = validationDataReader[decoderIdx];
-        IDataReader<ElemType>* decoderTrainSetDataReader = trainDataReader[decoderIdx];
-        ComputationNetworkPtr decoderNet = nets[decoderIdx];
-
-        fprintf(stderr, "Finished Epoch[%d]: [Training Set] Decoder Train Loss Per Sample = %.8g ", i + 1, epochCriterion);
-        if (epochEvalErrors.size() == 1)
-        {
-            fprintf(stderr, "EvalErr Per Sample = %.8g Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", epochEvalErrors[0], learnRatePerSample, epochTime);
-        }
-        else
-        {
-            fprintf(stderr, "EvalErr Per Sample ");
-            for (size_t j = 0; j < epochEvalErrors.size(); j++)
-                fprintf(stderr, "[%lu]=%.8g ", j, epochEvalErrors[j]);
-            fprintf(stderr, "Ave Learn Rate Per Sample = %.10g Epoch Time=%.8g\n", learnRatePerSample, epochTime);
-            fprintf(stderr, "Finished Epoch[%d]: Criterion Node Per Sample = %.8g\n", i + 1, epochCriterion);
-            for (size_t j = 0; j < epochEvalErrors.size(); j++)
-                fprintf(stderr, "Finished Epoch[%d]: Evaluation Node [%ls] Per Sample = %.8g\n", i + 1, evalNodeNames[j].c_str(), epochEvalErrors[j]);
-        }
-
-        if (decoderValidationSetDataReader != decoderTrainSetDataReader && decoderValidationSetDataReader != nullptr)
-        {
-            MultiNetworksEvaluator<ElemType> evalforvalidation(decoderNet);
-
-            double vScore = evalforvalidation.EvaluateEncoderDecoderWithHiddenStates(
-                nets,
-                validationDataReader,
-                m_mbSize[i]);
-
-            fprintf(stderr, "Finished Epoch[%d]: [Validation Set] Loss Per Sample = %.8g \n ", i + 1, vScore);
-
-            epochCriterion = vScore;
-        }
-
-        bool loadedPrevModel = false;
-        size_t epochsSinceLastLearnRateAdjust = i % m_learnRateAdjustInterval + 1;
-        if (avgCriterion == std::numeric_limits<double>::infinity())
-            avgCriterion = epochCriterion;
-        else
-            avgCriterion = ((epochsSinceLastLearnRateAdjust - 1 - epochsNotCountedInAvgCriterion) * avgCriterion + epochCriterion) / (epochsSinceLastLearnRateAdjust - epochsNotCountedInAvgCriterion);
-
-        if (m_autoLearnRateSearchType == LearningRateSearchAlgorithm::AdjustAfterEpoch && m_learningRatesParam.size() <= i && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval)
-        {
-            if (prevCriterion - avgCriterion < 0 && prevCriterion != std::numeric_limits<double>::infinity())
-            {
-                if (m_loadBestModel)
-                {
-                    // reload the previous model and check-point info
-                    for (size_t k = 0; k < iNumNetworks; k++)
-                    {
-                        nets[k]->RereadPersistableParameters(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k)));
-                        nets[k]->ResetEvalTimeStamps();
-                    }
-
-                    size_t dummyLr = 0;
-                    this->LoadCheckPointInfo(i - 1, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, dummyLr);
-                    fprintf(stderr, "Loaded the previous model which has better training criterion.\n");
-                    loadedPrevModel = true;
-                }
-            }
-
-            if (m_continueReduce)
-            {
-                if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
-                {
-                    if (learnRateReduced == false)
-                    {
-                        learnRateReduced = true;
-                    }
-                    else
-                    {
-                        // persist model and check-point info
-                        for (size_t k = 0; k < iNumNetworks; k++)
-                        {
-                            nets[k]->Save(GetModelNameForEpoch(i, true, msra::strfun::wstrprintf(L".%d", k)));
-                        }
-                        fprintf(stderr, "Finished training and saved final model\n\n");
-                        break;
-                    }
-                }
-                if (learnRateReduced)
-                {
-                    learnRatePerSample *= m_learnRateDecreaseFactor;
-                    fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
-                }
-            }
-            else
-            {
-                if (prevCriterion - avgCriterion <= m_reduceLearnRateIfImproveLessThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
-                {
-                    learnRatePerSample *= m_learnRateDecreaseFactor;
-                    fprintf(stderr, "learnRatePerSample reduced to %.8g\n", learnRatePerSample);
-                }
-                else if (prevCriterion - avgCriterion > m_increaseLearnRateIfImproveMoreThan * prevCriterion && prevCriterion != std::numeric_limits<double>::infinity())
-                {
-                    learnRatePerSample *= m_learnRateIncreaseFactor;
-                    fprintf(stderr, "learnRatePerSample increased to %.8g\n", learnRatePerSample);
-                }
-            }
-        }
-
-        if (!loadedPrevModel && epochsSinceLastLearnRateAdjust == m_learnRateAdjustInterval) // if we did not reload the previous model, adopt the current averages
-        {
-            prevCriterion = avgCriterion;
-            epochsNotCountedInAvgCriterion = 0;
-        }
-
-        // persist model and check-point info
-        for (size_t k = 0; k < iNumNetworks; k++)
-        {
-            nets[k]->Save(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k)));
-        }
-
-        this->SaveCheckPointInfo(i, totalSamplesSeen, learnRatePerSample, smoothedGradients, prevCriterion, 0);
-        if (!m_keepCheckPointFiles)
-            _wunlink(GetCheckPointFileNameForEpoch(i - 1).c_str()); // delete the previous checkpoint file to save space
-
-        if (learnRatePerSample < 1e-12)
-            fprintf(stderr, "learnRate per sample is reduced to %.8g which is below 1e-12. stop training.\n", learnRatePerSample);
-    }
-
-    for (size_t i = 0; i < iNumNetworks; i++)
-    {
-        delete inputMatrices[i];
-    }
-}
-
-/// use hidden states between encoder and decoder to communicate between two networks
-void TrainOneEpochEncoderDecoderWithHiddenStates(
-    const int epochNumber,
-    const size_t epochSize,
-    vector<ComputationNetworkPtr> nets, /// networks in forward-pass order (encoder first)
-    vector<IDataReader<ElemType>*> dataReader,
-    vector<std::vector<ComputationNodeBasePtr>*> featureNodes,
-    vector<std::vector<ComputationNodeBasePtr>*> pairNodes,
-    vector<std::vector<ComputationNodeBasePtr>*> evaluationNodes,
-    vector<std::map<std::wstring, Matrix<ElemType>*>*> inputMatrices,
-    vector<std::vector<ComputationNodeBasePtr>*> labelNodes,
-    vector<std::vector<ComputationNodeBasePtr>*> criterionNodes,
-    const std::list<ComputationNodeBasePtr>& learnableNodes,
-    const double learnRatePerSample,
-    std::list<Matrix<ElemType>>& smoothedGradients,
-    double& epochCriterion, std::vector<double>& epochEvalErrors, size_t& totalSamplesSeen)
-{
-    ComputationNetworkPtr encoderNet = nets[0];
-    ComputationNetworkPtr decoderNet = nets[1];
-    DEVICEID_TYPE device = encoderNet->GetDeviceId();
-    Matrix<ElemType> historyMat(device);
-
-    double readTimeInMBs = 0, ComputeTimeInMBs = 0;
-    double epochCriterionLastMBs = 0;
-
-    int numSamplesLastMBs = 0;
-    std::vector<double> epochEvalErrorsLastMBs(epochEvalErrors.size(), 0);
-
-    clock_t startReadMBTime = 0, startComputeMBTime = 0;
-    clock_t endReadMBTime = 0, endComputeMBTime = 0;
-
-    // initialize statistics
-    size_t totalEpochSamples = 0;
-
-    int numMBsRun = 0;
-
-    size_t numEvalNodes = epochEvalErrors.size();
-
-    // NOTE: the following two local matrices are not used in the PTask path
-    Matrix<ElemType> localEpochCriterion(1, 2, decoderNet->GetDeviceId()); // assume only one training criterion node for each epoch
-    Matrix<ElemType> localEpochEvalErrors(1, numEvalNodes, decoderNet->GetDeviceId());
-
-    localEpochCriterion.SetValue(0);
-    localEpochEvalErrors.SetValue(0);
-
-    for (auto ptr = dataReader.begin(); ptr != dataReader.end(); ptr++)
-    {
-        (*ptr)->StartMinibatchLoop(m_mbSize[epochNumber], epochNumber, m_epochSize);
-    }
-
-    startReadMBTime = clock();
-
-    size_t iNumNetworks = nets.size();
-
-    unsigned uSeedForDataReader = epochNumber;
-
-    bool bContinueDecoding = true;
-    while (bContinueDecoding)
-    {
-        size_t i = 0;
-        for (auto ptr = dataReader.begin(); ptr != dataReader.end(); ptr++, i++)
-        {
-            IDataReader<ElemType>* pptr = (*ptr);
-            pptr->SetRandomSeed(uSeedForDataReader);
-            if (i == 0)
-                pptr->GetMinibatch(*(inputMatrices[i]));
-            else if (pptr->GetMinibatch(*(inputMatrices[i])) == false)
-            {
-                bContinueDecoding = false;
-                break;
-            }
-        }
-
-        if (!bContinueDecoding)
-            break;
-
-        size_t actualMBSize = decoderNet->DetermineActualMBSizeFromFeatures();
-        if (actualMBSize == 0)
-            LogicError("decoderTrainSetDataReader read data but decoderNet reports no data read");
-
-        for (size_t i = 0; i < iNumNetworks; i++)
-        {
-            ComputationNetwork::BumpEvalTimeStamp(*featureNodes[i]);
-            if (labelNodes[i]->size() > 0)
-                ComputationNetwork::BumpEvalTimeStamp(*labelNodes[i]);
-        }
-
-        endReadMBTime = clock();
-        startComputeMBTime = clock();
-
-        /// not the sentence beginning, because the initial hidden layer activity is from the encoder network
-        // decoderTrainSetDataReader->SetSentenceBegin(false);
-        // decoderTrainSetDataReader->CopyMBLayoutTo(decoderNet->m_mbLayout.m_sentenceBoundaryFlags);
-        // decoderTrainSetDataReader->CopyMBLayoutTo(decoderNet->m_sentenceBegin);
-
-        if (m_doGradientCheck)
-        {
-            if (EncoderDecoderGradientCheck(nets,
-                                            dataReader,
-                                            evaluationNodes,
-                                            pairNodes,
-                                            featureNodes,
-                                            criterionNodes,
-                                            localEpochCriterion, localEpochEvalErrors) == false)
-            {
-                RuntimeError("SGD::TrainOneEpochEncoderDecoderWithHiddenStates gradient check not passed!");
-            }
-            localEpochCriterion.SetValue(0);
-            localEpochEvalErrors.SetValue(0);
-        }
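Aside: the reading loop above advances all readers in lockstep; the first (encoder) reader's return value is not checked, and any later reader running out of data ends the epoch. A minimal sketch of that policy with a hypothetical Reader stand-in (not the CNTK IDataReader interface):

#include <vector>

struct Reader
{
    int remaining;
    bool GetMinibatch() { return remaining-- > 0; } // true while data is left
};

// Returns false as soon as any non-first reader runs dry, which is the signal
// for the caller to break out of the minibatch loop.
bool ReadAllInLockstep(std::vector<Reader>& readers)
{
    for (size_t i = 0; i < readers.size(); i++)
    {
        bool got = readers[i].GetMinibatch();
        if (i > 0 && !got)
            return false;
    }
    return true;
}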
-
-        EncoderDecoderWithHiddenStatesForwardPass(nets,
-                                                  dataReader, pairNodes, evaluationNodes,
-                                                  featureNodes, criterionNodes,
-                                                  localEpochCriterion, localEpochEvalErrors);
-
-        EncoderDecoderWithHiddenStatesErrorProp(nets, pairNodes, criterionNodes);
-
-        // update model parameters
-        if (learnRatePerSample > m_minLearnRate * 0.01)
-        {
-            auto smoothedGradientIter = smoothedGradients.begin();
-            for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
-            {
-                ComputationNodeBasePtr node = *nodeIter;
-                if (node->IsParameterUpdateRequired())
-                {
-                    Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
-
-                    UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber /*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier, m_useNesterovMomentum);
-                }
-            }
-        }
-
-        endComputeMBTime = clock();
-        numMBsRun++;
-        if (m_traceLevel > 0)
-        {
-            double MBReadTime = (double) (endReadMBTime - startReadMBTime) / (CLOCKS_PER_SEC);
-            double MBComputeTime = (double) (endComputeMBTime - startComputeMBTime) / CLOCKS_PER_SEC;
-
-            readTimeInMBs += MBReadTime;
-            ComputeTimeInMBs += MBComputeTime;
-            numSamplesLastMBs += int(actualMBSize);
-
-            if (numMBsRun % m_numMBsToShowResult == 0)
-            {
-                epochCriterion = localEpochCriterion.Get00Element();
-                for (size_t i = 0; i < numEvalNodes; i++)
-                    epochEvalErrors[i] = (const double) localEpochEvalErrors(0, i);
-
-                double llk = (epochCriterion - epochCriterionLastMBs) / numSamplesLastMBs;
-                double ppl = exp(llk);
-                fprintf(stderr, "Epoch[%d]-Minibatch[%d-%d]: Samples Seen = %d Decoder Train Loss Per Sample = %.8g PPL = %.4e ", epochNumber + 1, numMBsRun - m_numMBsToShowResult + 1, numMBsRun, numSamplesLastMBs,
-                        llk, ppl);
-                for (size_t i = 0; i < numEvalNodes; i++)
-                {
-                    fprintf(stderr, "EvalErr[%lu] Per Sample = %.8g ", i, (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs);
-                }
-                fprintf(stderr, "ReadData Time = %.8g Computing Time=%.8g Total Time Per Sample=%.8g\n", readTimeInMBs, ComputeTimeInMBs, (readTimeInMBs + ComputeTimeInMBs) / numSamplesLastMBs);
-
-                // reset statistics
-                readTimeInMBs = ComputeTimeInMBs = 0;
-                numSamplesLastMBs = 0;
-
-                epochCriterionLastMBs = epochCriterion;
-                for (size_t i = 0; i < numEvalNodes; i++)
-                    epochEvalErrorsLastMBs[i] = epochEvalErrors[i];
-            }
-        }
-        startReadMBTime = clock();
-        totalEpochSamples += actualMBSize;
-        totalSamplesSeen += actualMBSize;
-
-        if (totalEpochSamples >= epochSize)
-            break;
-
-        /// call the DataEnd function; DataEnd performs reader-specific processing when a sentence ending is reached
-        // encoderTrainSetDataReader->SetSentenceEnd(true);
-        // decoderTrainSetDataReader->SetSentenceEnd(true);
-        for (auto ptr = dataReader.begin(); ptr != dataReader.end(); ptr++)
-        {
-            (*ptr)->DataEnd(endDataSentence);
-        }
-
-        uSeedForDataReader++;
-    }
-
-    localEpochCriterion /= float(totalEpochSamples);
-    localEpochEvalErrors /= float(totalEpochSamples);
-
-    epochCriterion = localEpochCriterion.Get00Element();
-    for (size_t i = 0; i < numEvalNodes; i++)
-    {
-        epochEvalErrors[i] = localEpochEvalErrors(0, i);
-    }
-    fprintf(stderr, "total samples in epoch[%d] = %zd\n", epochNumber, totalEpochSamples);
-}
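Aside: the trace output above derives perplexity from the accumulated criterion. Since the criterion matrix holds summed cross-entropy, dividing the delta since the last report by the sample count gives the average per-sample log-loss, and exp() of that is perplexity. A standalone check of the arithmetic (values below are made up for illustration):

#include <cmath>
#include <cstdio>

int main()
{
    double criterionNow = 460.0, criterionLast = 100.0; // summed cross-entropy in nats
    int samplesSinceLast = 120;
    double llk = (criterionNow - criterionLast) / samplesSinceLast; // 3.0
    printf("avg log-loss = %.4f, PPL = %.4f\n", llk, exp(llk));     // PPL ~ 20.09
    return 0;
}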
-
-bool EncoderDecoderGradientCheck(
-    vector<ComputationNetworkPtr> nets, /// networks in forward-pass order (encoder first)
-    vector<IDataReader<ElemType>*> dataReader,
-    vector<std::vector<ComputationNodeBasePtr>*> evaluationNodes,
-    vector<std::vector<ComputationNodeBasePtr>*> pairNodes,
-    vector<std::vector<ComputationNodeBasePtr>*> featureNodes,
-    vector<std::vector<ComputationNodeBasePtr>*> criterionNodes,
-    Matrix<ElemType>& localEpochCriterion,
-    Matrix<ElemType>& localEpochEvalErrors)
-{
-    size_t iNumNetworks = nets.size();
-    vector<string> verror_msgs;
-    DEVICEID_TYPE deviceId;
-
-    for (int i = iNumNetworks - 1; i >= 0; i--)
-    {
-        /// check decoder learnable parameters
-        const std::list<ComputationNodeBasePtr>& learnableNodes =
-            (evaluationNodes[i]->size() == 0 && pairNodes[i]->size() > 0) ? nets[i]->LearnableParameterNodes((*pairNodes[i])[0])
-                                                                          : nets[i]->LearnableParameterNodes((*evaluationNodes[i])[0]);
-
-        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-        {
-            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
-
-            for (size_t itry = 0; itry < min((size_t) 10, node->Value().GetNumElements()); itry++)
-            {
-                int irow = (int) fmod(rand(), node->Value().GetNumRows() - 1);
-                int icol = (int) fmod(rand(), node->Value().GetNumCols() - 1);
-                irow = max(0, irow);
-                icol = max(0, icol);
-
-                fprintf(stderr, "\n###### d%ls######\n", node->NodeName().c_str());
-                deviceId = node->Value().GetDeviceId(); // original device id
-
-                node->Value().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
-                double eOrg = node->Value()(irow, icol); /// warning :: this function will put the matrix into CPU memory
-                node->Value().TransferToDeviceIfNotThere(deviceId, true);
-
-                /// perturb parameter
-                double ePos = eOrg + EPSILON;
-                node->Value().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
-                node->Value().SetValue(irow, icol, (ElemType) ePos);
-                node->Value().TransferToDeviceIfNotThere(deviceId, true);
-
-                node->BumpEvalTimeStamp();
-                localEpochCriterion.SetValue(0);
-                localEpochEvalErrors.SetValue(0);
-
-                EncoderDecoderWithHiddenStatesForwardPass(nets,
-                                                          dataReader, pairNodes, evaluationNodes,
-                                                          featureNodes, criterionNodes,
-                                                          localEpochCriterion, localEpochEvalErrors);
-
-                double score1 = localEpochCriterion.Get00Element();
-
-                double eNeg = eOrg - EPSILON;
-                node->Value().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
-                node->Value().SetValue(irow, icol, (ElemType) eNeg);
-                node->Value().TransferToDeviceIfNotThere(deviceId, true);
-                node->BumpEvalTimeStamp();
-                localEpochCriterion.SetValue(0);
-                localEpochEvalErrors.SetValue(0);
-
-                EncoderDecoderWithHiddenStatesForwardPass(nets,
-                                                          dataReader, pairNodes, evaluationNodes,
-                                                          featureNodes, criterionNodes,
-                                                          localEpochCriterion, localEpochEvalErrors);
-
-                double score1r = localEpochCriterion.Get00Element();
-
-                double grdNum = (score1r - score1) / (eNeg - ePos);
-
-                node->Value().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
-                node->Value().SetValue(irow, icol, (ElemType) eOrg);
-                node->Value().TransferToDeviceIfNotThere(deviceId, true);
-                node->BumpEvalTimeStamp();
-                localEpochCriterion.SetValue(0);
-                localEpochEvalErrors.SetValue(0);
-
-                EncoderDecoderWithHiddenStatesForwardPass(nets,
-                                                          dataReader, pairNodes, evaluationNodes,
-                                                          featureNodes, criterionNodes,
-                                                          localEpochCriterion, localEpochEvalErrors);
-
-                EncoderDecoderWithHiddenStatesErrorProp(nets, pairNodes, criterionNodes);
-
-                node->Gradient().TransferFromDeviceToDevice(deviceId, CPUDEVICE, true, false, false);
-                double grdErr = node->Gradient()(irow, icol);
-                node->Gradient().TransferToDeviceIfNotThere(deviceId, true);
-
-                // check if they are consistent
-                double threshold = pow(10.0, max(0.0, ceil(log10(min(fabs(grdErr), fabs(grdNum))))) - (int) m_gradientCheckSigDigit);
-                double diff = fabs(grdErr - grdNum);
-                bool wrong = (std::isnan(diff) || diff > threshold);
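Aside: the check above compares the backprop gradient against a central difference of the criterion. The same arithmetic on a toy scalar function, self-contained (the function f and its analytic derivative below are hypothetical stand-ins for the network criterion and backprop gradient):

#include <cmath>
#include <cstdio>

int main()
{
    const double EPSILON = 1e-4;
    auto f = [](double x) { return x * x * x; }; // toy "criterion"
    double x = 2.0;
    double analytic = 3.0 * x * x;               // the "backprop" gradient, 12.0
    // Same orientation as grdNum = (score1r - score1) / (eNeg - ePos) above:
    double numeric = (f(x - EPSILON) - f(x + EPSILON)) / ((x - EPSILON) - (x + EPSILON));
    printf("analytic = %.6f numeric = %.6f diff = %.2e\n", analytic, numeric, fabs(analytic - numeric));
    return 0;
}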
-                if (wrong)
-                {
-                    char serr[2048];
-                    sprintf((char*) serr, "Decoder %ls Numeric gradient = %e, Error BP gradient = %e",
-                            node->NodeName().c_str(), static_cast<double>(grdNum), static_cast<double>(grdErr));
-                    fprintf(stdout, "%s\n", serr);
-                    verror_msgs.push_back(serr);
-                }
-            }
-        }
-    }
-
-    if (verror_msgs.size() > 0)
-        return false;
-    return true;
-}
-
-void EncoderDecoderWithHiddenStatesForwardPass(
-    vector<ComputationNetworkPtr>& nets, // TODO: should these vectors all be refs?
-    vector<IDataReader<ElemType>*>& dataReader,
-    vector<std::vector<ComputationNodeBasePtr>*>& pairNodes,
-    vector<std::vector<ComputationNodeBasePtr>*>& evaluationNodes,
-    vector<std::vector<ComputationNodeBasePtr>*>& /*featureNodes*/,
-    vector<std::vector<ComputationNodeBasePtr>*>& criterionNodes,
-    Matrix<ElemType>& localEpochCriterion,
-    Matrix<ElemType>& localEpochEvalErrors)
-{
-    size_t iNumNetworks = nets.size();
-
-    for (size_t i = 0; i < iNumNetworks - 1; i++)
-    {
-        size_t j = i + 1;
-
-        EncoderDecoderWithHiddenStatesForwardPass(nets[i], nets[j],
-                                                  dataReader[i], dataReader[j],
-                                                  *pairNodes[i],
-                                                  *criterionNodes[j],
-                                                  *evaluationNodes[j],
-                                                  *pairNodes[j],
-                                                  localEpochCriterion, localEpochEvalErrors);
-    }
-}
-
-void EncoderDecoderWithHiddenStatesForwardPass(
-    ComputationNetworkPtr encoderNet, /// encoder network
-    ComputationNetworkPtr decoderNet,
-    IDataReader<ElemType>* encoderTrainSetDataReader,
-    IDataReader<ElemType>* decoderTrainSetDataReader,
-    vector<ComputationNodeBasePtr>& encoderEvaluationNodes,
-    vector<ComputationNodeBasePtr>& decoderCriterionNodes,
-    vector<ComputationNodeBasePtr>& decoderEvaluationNodes,
-    vector<ComputationNodeBasePtr>& decoderPairNodes,
-    Matrix<ElemType>& localEpochCriterion,
-    Matrix<ElemType>& localEpochEvalErrors)
-{
-    // encoderNet->SetActualMiniBatchSizeFromFeatures();
-    encoderTrainSetDataReader->CopyMBLayoutTo(encoderNet->GetMBLayoutPtr());
-    encoderNet->VerifyActualNumParallelSequences(encoderTrainSetDataReader->GetNumParallelSequences());
-
-    encoderNet->ForwardProp(encoderEvaluationNodes[0]);
-
-    // decoderNet->SetActualMiniBatchSizeFromFeatures();
-    decoderTrainSetDataReader->CopyMBLayoutTo(decoderNet->GetMBLayoutPtr());
-    decoderNet->VerifyActualNumParallelSequences(decoderTrainSetDataReader->GetNumParallelSequences());
-    /// not the sentence beginning, because the initial hidden layer activity is from the encoder network
-
-    if (decoderCriterionNodes.size() == 0 && decoderEvaluationNodes.size() == 0)
-    {
-        decoderNet->ForwardProp(decoderPairNodes[0]);
-    }
-    else
-    {
-        decoderNet->ForwardProp(decoderCriterionNodes[0]);
-
-        Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(decoderCriterionNodes[0])->Value(), 0, 0, localEpochCriterion, 0, 0);
-
-        size_t numEvalNodes = decoderEvaluationNodes.size();
-        std::vector<double> mbEvalErrors(numEvalNodes, 0);
-
-        for (size_t i = 0; i < numEvalNodes; i++)
-        {
-            decoderNet->ForwardProp(decoderEvaluationNodes[i]);
-            Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(decoderEvaluationNodes[i])->Value(), 0, 0, localEpochEvalErrors, 0, i);
-        }
-#ifdef DEBUG_DECODER
-        fprintf(stderr, "ForwardPass score = %.8e\n", localEpochCriterion.Get00Element());
-#endif
-    }
-}
-
-void EncoderDecoderWithHiddenStatesErrorProp(
-    vector<ComputationNetworkPtr> networks, /// networks in forward-pass order (encoder first)
-    vector<std::vector<ComputationNodeBasePtr>*> pairNodes,
-    vector<std::vector<ComputationNodeBasePtr>*> criterionNodes)
-{
-    /**
-    the networks are organized in the order of the forward pass
-    */
-    size_t inetworks = networks.size();
-    if (inetworks != criterionNodes.size())
-        LogicError("EncoderDecoderWithHiddenStatesErrorProp: number of networks should be the same as the number of criterion nodes.");
-
-    for (size_t i = 0; i < pairNodes.size(); i++)
-    {
-        for (auto ptr = pairNodes[i]->begin(); ptr != pairNodes[i]->end(); ptr++)
-            networks[i]->ZeroGradients(*ptr);
-    }
-
-    for (size_t i = 0; i < criterionNodes.size(); i++)
-    {
-        for (auto ptr = criterionNodes[i]->begin(); ptr != criterionNodes[i]->end(); ptr++)
-            networks[i]->ZeroGradients(*ptr);
-    }
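Aside: the forward-pass wrapper above walks adjacent network pairs in order, so network i exposes its hidden state to network i+1 before the latter runs. A minimal sketch of that chaining with a hypothetical Net stand-in (not the CNTK ComputationNetwork API):

#include <vector>

struct Net
{
    void ForwardTo(Net& next) { /* run this net; expose its hidden state to `next` */ }
};

// Mirrors the pairwise loop above: one full forward pass visits
// (encoder, decoder), then (decoder, next decoder), and so on.
void ForwardPassAllPairs(std::vector<Net>& nets)
{
    for (size_t i = 0; i + 1 < nets.size(); i++)
        nets[i].ForwardTo(nets[i + 1]);
}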
-
-    for (auto ptr = criterionNodes[inetworks - 1]->begin(); ptr != criterionNodes[inetworks - 1]->end(); ptr++)
-    {
-        if (ptr == criterionNodes[inetworks - 1]->begin())
-        {
-            networks[inetworks - 1]->ForwardProp(*ptr);
-            networks[inetworks - 1]->Backprop(*ptr);
-        }
-        else
-        {
-            networks[inetworks - 1]->ForwardProp(*ptr);
-#if 1 // disable this, so that we can remove the options from Backprop() (trivial to bring back if ever needed)
-            NOT_IMPLEMENTED;
-#else
-            // This is the old signature of Backprop()
-            // void Backprop(const ComputationNodeBasePtr rootNode, bool /*bResetToOne*/, bool /*bClearGradient*/)
-            networks[inetworks - 1]->Backprop(*ptr, false, false);
-#endif
-        }
-    }
-
-    for (int i = inetworks - 2; i >= 0; i--)
-    {
-        if (criterionNodes[i]->size() > 0)
-        {
-            /// has a criterion; no need to compute gradients from pair nodes, because their gradients have already been added
-            for (auto ptr = criterionNodes[i]->begin(); ptr != criterionNodes[i]->end(); ptr++)
-            {
-                networks[i]->ForwardProp(*ptr);
-#if 1
-                NOT_IMPLEMENTED;
-#else
-                networks[i]->Backprop(*ptr, true, false);
-#endif
-            }
-        }
-        else if (pairNodes[i]->size() > 0)
-        {
-            /// no criterion, so use pair-node gradients
-            for (auto ptr = pairNodes[i]->begin(); ptr != pairNodes[i]->end(); ptr++)
-            {
-                networks[i]->ForwardProp(*ptr);
-#if 1
-                NOT_IMPLEMENTED;
-#else
-                networks[i]->Backprop(*ptr, false, false);
-#endif
-            }
-        }
-    }
-}
-};
-} } }
diff --git a/Source/SGDLib/SGDLib.vcxproj b/Source/SGDLib/SGDLib.vcxproj
index e3e31cf095b4..d3c5232f7346 100644
--- a/Source/SGDLib/SGDLib.vcxproj
+++ b/Source/SGDLib/SGDLib.vcxproj
@@ -178,13 +178,10 @@
-
-
-
diff --git a/Source/SGDLib/SGDLib.vcxproj.filters b/Source/SGDLib/SGDLib.vcxproj.filters
index 4710ad336f8c..f9675c72c9d7 100644
--- a/Source/SGDLib/SGDLib.vcxproj.filters
+++ b/Source/SGDLib/SGDLib.vcxproj.filters
@@ -126,12 +126,6 @@
       Data Reading
-
-      MultiNetworks
-
-
-      MultiNetworks
-
      Common\Include
@@ -141,9 +135,6 @@
      Common\Include
-
-      MultiNetworks
-
      Parallelization
@@ -185,8 +176,5 @@
      {b866d513-7bd0-497c-98c2-f62dbcd4cde4}
-
-      {ae1eea3c-d77f-46ec-bf4f-1cd093a295e8}
-
\ No newline at end of file