From 06e38a9fbdade93413f69f026ff16484c55d1682 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 9 Mar 2016 09:47:52 -0800
Subject: [PATCH 01/26] refactored node formatting from SimpleOutputWriter into
 ComputationNode, for use as a debugging aid. No code change other than
 refactoring-related

---
 .../ComputationNetworkLib/ComputationNode.cpp | 103 +++++++++++++++++
 .../ComputationNetworkLib/ComputationNode.h   |   5 +
 Source/SGDLib/SimpleOutputWriter.h            | 106 ++----------------
 3 files changed, 118 insertions(+), 96 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 4c57c58380e1..d983399903bc 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -229,6 +229,109 @@ template <class ElemType>
     }
 }
 
+// write out the content of a node in formatted/readable form
+template <class ElemType>
+void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, bool transpose, bool isCategoryLabel, const std::vector<std::string>& labelMapping,
+                                                             const string& sequenceSeparator, const string& sequencePrologue, const string& sequenceEpilogue, const string& elementSeparator, const string& sampleSeparator,
+                                                             const string& valueFormatString) const
+{
+    // get it (into a flat CPU-side vector)
+    const Matrix<ElemType>& outputValues = Value();
+    size_t tempArraySize = 0;
+    ElemType* tempArray = nullptr;
+    outputValues.CopyToArray(tempArray, tempArraySize);
+
+    // process all sequences one by one
+    auto pMBLayout = GetMBLayout();
+    if (!pMBLayout) // no MBLayout: We are printing aggregates (or LearnableParameters?)
+    {
+        pMBLayout = make_shared<MBLayout>();
+        pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
+        // TODO: This can be done more efficiently, if ever needed.
+    }
+    const auto& sequences = pMBLayout->GetAllSequences();
+    size_t colStride = pMBLayout->GetNumParallelSequences() * outputValues.GetNumRows(); // how to get from one column to the next
+    size_t width = pMBLayout->GetNumTimeSteps();
+    for (size_t s = 0; s < sequences.size(); s++)
+    {
+        const auto& seqInfo = sequences[s];
+        size_t tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
+        size_t tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width;
+
+        // current sequence is a matrix with 'colStride' beginning at the following pointer
+        ElemType* pCurValue = tempArray + s * outputValues.GetNumRows() + seqInfo.tBegin;
+
+        if (s > 0)
+            fprintfOrDie(f, "%s", sequenceSeparator.c_str());
+        fprintfOrDie(f, "%s", sequencePrologue.c_str());
+
+        // output it according to our format specification
+        let formatChar = valueFormatString.back();
+        size_t dim = outputValues.GetNumRows();
+        size_t T = tEnd - tBegin;
+        if (isCategoryLabel)
+        {
+            if (formatChar == 's') // verify label dimension
+            {
+                if (outputValues.GetNumRows() != labelMapping.size())
+                    InvalidArgument("write: Row dimension %d does not match number of entries %d in labelMappingFile", (int)dim, (int)labelMapping.size());
+            }
+            // update the matrix in-place from one-hot (or max) to index
+            // find the max in each column
+            for (size_t j = 0; j < T; j++)
+            {
+                double maxPos = -1;
+                double maxVal = 0;
+                for (size_t i = 0; i < dim; i++)
+                {
+                    double val = pCurValue[i + j * dim * colStride];
+                    if (maxPos < 0 || val >= maxVal)
+                    {
+                        maxPos = (double)i;
+                        maxVal = val;
+                    }
+                }
+                pCurValue[0 + j * colStride] = (ElemType)maxPos; // overwrite first element in-place
+            }
+            dim = 1; // ignore remaining dimensions
+        }
+        size_t iend = transpose ? dim : T;
+        size_t jend = transpose ? T : dim;
+        size_t istride = transpose ? 1 : colStride;
+        size_t jstride = transpose ? colStride : 1;
+        for (size_t j = 0; j < jend; j++)
+        {
+            if (j > 0)
+                fprintfOrDie(f, "%s", sampleSeparator.c_str());
+            for (size_t i = 0; i < iend; i++)
+            {
+                if (i > 0)
+                    fprintfOrDie(f, "%s", elementSeparator.c_str());
+                if (formatChar == 'f') // print as real number
+                {
+                    double dval = pCurValue[i * istride + j * jstride];
+                    fprintfOrDie(f, valueFormatString.c_str(), dval);
+                }
+                else if (formatChar == 'u') // print category as integer index
+                {
+                    unsigned int uval = (unsigned int)pCurValue[i * istride + j * jstride];
+                    fprintfOrDie(f, valueFormatString.c_str(), uval);
+                }
+                else if (formatChar == 's') // print category as a label string
+                {
+                    size_t uval = (size_t)pCurValue[i * istride + j * jstride];
+                    assert(uval < labelMapping.size());
+                    const char * sval = labelMapping[uval].c_str();
+                    fprintfOrDie(f, valueFormatString.c_str(), sval);
+                }
+            }
+        }
+        fprintfOrDie(f, "%s", sequenceEpilogue.c_str());
+    } // end loop over sequences
+
+    delete[] tempArray;
+}
+
 // -----------------------------------------------------------------------
 // instantiate the core class templates
 // -----------------------------------------------------------------------

diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 30dca5b48d57..d788775c77bb 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -1495,10 +1495,15 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot
     // -----------------------------------------------------------------------
 
     virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const;
+    // helper for SimpleOutWriter, living in here to be able to use in debugging
+    void WriteMinibatchWithFormatting(FILE* f, bool transpose, bool isCategoryLabel, const std::vector<std::string>& labelMapping,
+                                      const std::string& sequenceSeparator, const std::string& sequencePrologue, const std::string& sequenceEpilogue, const std::string& elementSeparator, const std::string& sampleSeparator,
+                                      const std::string& valueFormatString) const;
 
 protected:
     // print node values
+    // This is used for dumping model parameters, not minibatch data.
     void PrintNodeValuesToFile(const bool printValues, const bool printMetadata, File& fstream) const
     {
         if (printValues)

diff --git a/Source/SGDLib/SimpleOutputWriter.h b/Source/SGDLib/SimpleOutputWriter.h
index f4a0d824043d..c98e10119adf 100644
--- a/Source/SGDLib/SimpleOutputWriter.h
+++ b/Source/SGDLib/SimpleOutputWriter.h
@@ -222,8 +222,6 @@ class SimpleOutputWriter
 
         size_t totalEpochSamples = 0;
         size_t numMBsRun = 0;
-        size_t tempArraySize = 0;
-        ElemType* tempArray = nullptr;
 
         for (auto & onode : outputNodes)
         {
             fprintfOrDie(f, "%s", formattingOptions.prologue.c_str());
         }
 
-        char formatChar = !formattingOptions.isCategoryLabel ? 'f' : !formattingOptions.labelMappingFile.empty() ? 's' : 'u';
-        std::string valueFormatString = "%" + formattingOptions.precisionFormat + formatChar; // format string used in fprintf() for formatting the values
-
         size_t actualMBSize;
         const size_t numIterationsBeforePrintingProgress = 100;
         size_t numItersSinceLastPrintOfProgress = 0;
@@ -247,10 +242,6 @@ class SimpleOutputWriter
             // Note: Intermediate values are memoized, so in case of multiple output nodes, we only compute what has not been computed already.
             m_net->ForwardProp(onode);
 
-            // get it (into a flat CPU-side vector)
-            Matrix<ElemType>& outputValues = dynamic_pointer_cast<ComputationNode<ElemType>>(onode)->Value();
-            outputValues.CopyToArray(tempArray, tempArraySize);
-
             // sequence separator
             FILE * f = *outputStreams[onode];
             const auto sequenceSeparator = formattingOptions.Processed(onode->NodeName(), formattingOptions.sequenceSeparator);
@@ -259,91 +250,16 @@ class SimpleOutputWriter
             const auto elementSeparator = formattingOptions.Processed(onode->NodeName(), formattingOptions.elementSeparator);
             const auto sampleSeparator = formattingOptions.Processed(onode->NodeName(), formattingOptions.sampleSeparator);
 
-            // process all sequences one by one
-            auto pMBLayout = onode->GetMBLayout();
-            if (!pMBLayout) // no MBLayout: We are printing aggregates (or LearnableParameters?)
-            {
-                pMBLayout = make_shared<MBLayout>();
-                pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
-            }
-            const auto& sequences = pMBLayout->GetAllSequences();
-            size_t colStride = pMBLayout->GetNumParallelSequences() * outputValues.GetNumRows(); // how to get from one column to the next
-            size_t width = pMBLayout->GetNumTimeSteps();
-            for (size_t s = 0; s < sequences.size(); s++)
-            {
-                const auto& seqInfo = sequences[s];
-                size_t tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
-                size_t tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width;
-
-                // current sequence is a matrix with 'colStride' beginning at the following pointer
-                ElemType* pCurValue = tempArray + s * outputValues.GetNumRows() + seqInfo.tBegin;
-
-                if ((numMBsRun > 0 || s > 0) && !sequenceSeparator.empty())
-                    fprintfOrDie(f, "%s", sequenceSeparator.c_str());
-                fprintfOrDie(f, "%s", sequencePrologue.c_str());
-
-                // output it according to our format specification
-                size_t dim = outputValues.GetNumRows();
-                size_t T = tEnd - tBegin;
-                if (formattingOptions.isCategoryLabel)
-                {
-                    if (formatChar == 's') // verify label dimension
-                    {
-                        if (outputValues.GetNumRows() != labelMapping.size())
-                            InvalidArgument("write: Row dimension %d does not match number of entries %d in labelMappingFile '%ls'", (int)dim, (int)labelMapping.size(), formattingOptions.labelMappingFile.c_str());
-                    }
-                    // update the matrix in-place from one-hot (or max) to index
-                    // find the max in each column
-                    for (size_t j = 0; j < T; j++)
-                    {
-                        double maxPos = -1;
-                        double maxVal = 0;
-                        for (size_t i = 0; i < dim; i++)
-                        {
-                            double val = pCurValue[i + j * dim * colStride];
-                            if (maxPos < 0 || val >= maxVal)
-                            {
-                                maxPos = (double)i;
-                                maxVal = val;
-                            }
-                        }
-                        pCurValue[0 + j * colStride] = (ElemType)maxPos; // overwrite first element in-place
-                    }
-                    dim = 1; // ignore remaining dimensions
-                }
-                size_t iend = formattingOptions.transpose ? dim : T;
-                size_t jend = formattingOptions.transpose ? T : dim;
-                size_t istride = formattingOptions.transpose ? 1 : colStride;
-                size_t jstride = formattingOptions.transpose ? colStride : 1;
-                for (size_t j = 0; j < jend; j++)
-                {
-                    if (j > 0)
-                        fprintfOrDie(f, "%s", sampleSeparator.c_str());
-                    for (size_t i = 0; i < iend; i++)
-                    {
-                        if (i > 0)
-                            fprintfOrDie(f, "%s", elementSeparator.c_str());
-                        if (formatChar == 'f') // print as real number
-                        {
-                            double dval = pCurValue[i * istride + j * jstride];
-                            fprintfOrDie(f, valueFormatString.c_str(), dval);
-                        }
-                        else if (formatChar == 'u') // print category as integer index
-                        {
-                            unsigned int uval = (unsigned int) pCurValue[i * istride + j * jstride];
-                            fprintfOrDie(f, valueFormatString.c_str(), uval);
-                        }
-                        else if (formatChar == 's') // print category as a label string
-                        {
-                            size_t uval = (size_t) pCurValue[i * istride + j * jstride];
-                            assert(uval < labelMapping.size());
-                            const char * sval = labelMapping[uval].c_str();
-                            fprintfOrDie(f, valueFormatString.c_str(), sval);
-                        }
-                    }
-                }
-                fprintfOrDie(f, "%s", sequenceEpilogue.c_str());
-            } // end loop over sequences
+            char formatChar = !formattingOptions.isCategoryLabel ? 'f' : !formattingOptions.labelMappingFile.empty() ? 's' : 'u';
+            std::string valueFormatString = "%" + formattingOptions.precisionFormat + formatChar; // format string used in fprintf() for formatting the values
+
+            if (numMBsRun > 0) // WriteMinibatchWithFormatting() will not include this before first sequence
+                fprintfOrDie(f, "%s", sequenceSeparator.c_str());
+
+            auto pnode = dynamic_pointer_cast<ComputationNode<ElemType>>(onode);
+            pnode->WriteMinibatchWithFormatting(f, formattingOptions.transpose, formattingOptions.isCategoryLabel, labelMapping,
+                                                sequenceSeparator, sequencePrologue, sequenceEpilogue, elementSeparator, sampleSeparator,
+                                                valueFormatString);
         } // end loop over nodes
 
         totalEpochSamples += actualMBSize;
@@ -372,8 +288,6 @@ class SimpleOutputWriter
             fprintfOrDie(f, "%s", formattingOptions.epilogue.c_str());
         }
 
-        delete[] tempArray;
-
         fprintf(stderr, "Written to %ls*\nTotal Samples Evaluated = %lu\n", outputPath.c_str(), totalEpochSamples);
 
         // flush all files (where we can catch errors) so that we can then destruct the handle cleanly without error

From ba2238d2159e1cedc118e932255d565246668d70 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 9 Mar 2016 12:55:37 -0800
Subject: [PATCH 02/26] added an SGD option for basic tracing that dumps
 minibatches computed by nodes, using the same code as SimpleOutputWriter

---
 .../ComputationNetworkLib/ComputationNode.cpp | 24 ++++++++++++++-----
 .../ComputationNetworkLib/ComputationNode.h   | 22 ++++++++++++++++-
 Source/SGDLib/SGD.cpp                         |  9 ++++++-
 Source/SGDLib/SGD.h                           |  8 ++++++-
 Source/SGDLib/SimpleOutputWriter.h            |  2 +-
 5 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index d983399903bc..d67e2be995b9 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -231,7 +231,7 @@ template <class ElemType>
 
 // write out the content of a node in formatted/readable form
 template <class ElemType>
-void 
ComputationNode::WriteMinibatchWithFormatting(FILE* f, bool trans } dim = 1; // ignore remaining dimensions } - size_t iend = transpose ? dim : T; - size_t jend = transpose ? T : dim; - size_t istride = transpose ? 1 : colStride; - size_t jstride = transpose ? colStride : 1; + let iend = transpose ? dim : T; // true dimension of the data to print + let jend = transpose ? T : dim; + let istop = transpose ? onlyUpToRow : onlyUpToT; // we stop at these dimensions (for debugging, one often needs only the first few values of those huge matrices) + let jstop = transpose ? onlyUpToT : onlyUpToRow; + let istride = transpose ? 1 : colStride; + let jstride = transpose ? colStride : 1; for (size_t j = 0; j < jend; j++) { if (j > 0) fprintfOrDie(f, "%s", sampleSeparator.c_str()); + if (j == jstop) + { + fprintf(f, "..."); // 'nuff said + break; + } for (size_t i = 0; i < iend; i++) { if (i > 0) fprintfOrDie(f, "%s", elementSeparator.c_str()); - if (formatChar == 'f') // print as real number + if (i == istop) + { + fprintf(f, "..."); + break; + } + else if (formatChar == 'f') // print as real number { double dval = pCurValue[i * istride + j * jstride]; fprintfOrDie(f, valueFormatString.c_str(), dval); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index d788775c77bb..86d310c0379e 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -162,6 +162,15 @@ struct ComputationNetworkOwnedNodeState virtual void MarkValueSharable() { m_valueSharable = true; } bool IsValueSharable() const { return m_valueSharable; } + // tracing flags + // Enable to print the value of the function-value matrix in somewhat readable format. + // These are public since you are meant to set these flags manually in the debugger or temporarily poke into them from code as needed. 
+ bool m_traceNodeValue = false; + bool m_traceNodeValueAsCategoryLabel = false; + size_t m_traceNodeValueUpToDim = 5; + size_t m_traceNodeValueUpToT = 5; + void EnableNodeTracing(bool isCategoryLabel) { m_traceNodeValue = true; m_traceNodeValueAsCategoryLabel = isCategoryLabel; } + protected: // TODO: should be fully encapsulated here bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree) @@ -1306,6 +1315,7 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot Value().Print(msra::strfun::utf8(NodeName()), 0, min(Value().GetNumRows()-1, 4), 0, min(Value().GetNumCols()-1, 4)); #endif InvalidateMissingValueColumns(FrameRange(m_pMBLayout)); // blast NaNs into columns that are gaps in a packed layout + Trace(); } #endif @@ -1496,9 +1506,19 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const; // helper for SimpleOutWriter, living in here to be able to use in debugging - void WriteMinibatchWithFormatting(FILE* f, bool transpose, bool isCategoryLabel, const std::vector& labelMapping, + void WriteMinibatchWithFormatting(FILE* f, size_t onlyUpToRow, size_t onlyUpToT, bool transpose, bool isCategoryLabel, const std::vector& labelMapping, const std::string& sequenceSeparator, const std::string& sequencePrologue, const std::string& sequenceEpilogue, const std::string& elementSeparator, const std::string& sampleSeparator, const std::string& valueFormatString) const; + void Trace() + { + if (m_traceNodeValue) + { + fprintf(stderr, "Trace --> %ls = %ls -> [%s%s]\n", NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? 
" x *" : ""); + WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, true/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), + ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/, + "%13.10f"/*valueFormatString*/); + } + } protected: diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 0c0ae20f1eb8..7c725a1e20ca 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -71,6 +71,12 @@ void SGD::Train(function createN startEpoch = max(startEpoch, 0); m_needAdaptRegularization = false; + // set tracing flags + for (const auto& traceNodeName : m_traceNodeNamesReal) + net->GetNodeFromName(traceNodeName)->EnableNodeTracing(false); + for (const auto& traceNodeName : m_traceNodeNamesCategory) + net->GetNodeFromName(traceNodeName)->EnableNodeTracing(true); + TrainOrAdaptModel(startEpoch, net, loadNetworkFromCheckpoint, net, nullptr, trainSetDataReader, validationSetDataReader); } @@ -2585,4 +2591,5 @@ SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp) // register SGD<> with the ScriptableObject system ScriptableObjects::ConfigurableRuntimeTypeRegister::AddFloatDouble, SGD> registerSGDOptimizer(L"SGDOptimizer"); -} } } + +}}} diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 29035afe43d5..70f0889a0114 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -288,6 +288,8 @@ class SGD : public SGDParams // m_validateAfterModelReloading(configSGD(L"validateAfterModelReloading", true)), m_trainCriterionNodeName((const wstring&) configSGD(L"trainCriterionNodeName", L"")), m_evalCriterionNodeName((const wstring&) configSGD(L"evalCriterionNodeName", L"")), + m_traceNodeNamesReal(configSGD(L"traceNodeNamesReal", ConfigRecordType::Array(stringargvector()))), + m_traceNodeNamesCategory(configSGD(L"traceNodeNamesCategory", ConfigRecordType::Array(stringargvector()))), m_prevChosenMinibatchSize(0), m_lastFinishedEpochTrainLoss(0.0), m_distGradAgg(nullptr), @@ -504,6 +506,9 @@ class SGD : public SGDParams wstring m_trainCriterionNodeName; wstring m_evalCriterionNodeName; + // enable tracing. 
Nodes listed here get their m_traceNodeValue and m_traceNodeValueAsCategoryLabel flags set + vector m_traceNodeNamesReal, m_traceNodeNamesCategory; + size_t m_prevChosenMinibatchSize; double m_lastFinishedEpochTrainLoss; @@ -515,4 +520,5 @@ class SGD : public SGDParams private: int SGDTrace(FILE* __restrict __stream, const char* __restrict __format, ...); }; -} } } + +}}} diff --git a/Source/SGDLib/SimpleOutputWriter.h b/Source/SGDLib/SimpleOutputWriter.h index c98e10119adf..e6a8b057287b 100644 --- a/Source/SGDLib/SimpleOutputWriter.h +++ b/Source/SGDLib/SimpleOutputWriter.h @@ -257,7 +257,7 @@ class SimpleOutputWriter fprintfOrDie(f, "%s", sequenceSeparator.c_str()); auto pnode = dynamic_pointer_cast>(onode); - pnode->WriteMinibatchWithFormatting(f, formattingOptions.transpose, formattingOptions.isCategoryLabel, labelMapping, + pnode->WriteMinibatchWithFormatting(f, SIZE_MAX, SIZE_MAX, formattingOptions.transpose, formattingOptions.isCategoryLabel, labelMapping, sequenceSeparator, sequencePrologue, sequenceEpilogue, elementSeparator, sampleSeparator, valueFormatString); } // end loop over nodes From 4209d9df1008e4030abd128e78a7bed3b705a332 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 9 Mar 2016 22:22:24 -0800 Subject: [PATCH 03/26] bug fix: LMSequenceReader randomization must be deterministic (seed = epoch) --- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 4 +- Source/Common/Include/Config.h | 18 +++--- .../ComputationNetwork.cpp | 12 +++- .../ComputationNetwork.h | 1 + .../ComputationNetworkEvaluation.cpp | 62 +++++++++++-------- .../ComputationNetworkLib/ComputationNode.cpp | 2 +- .../ComputationNetworkLib/ComputationNode.h | 9 +-- Source/ComputationNetworkLib/RecurrentNodes.h | 3 +- .../LMSequenceReader/SequenceReader.cpp | 11 ++-- .../Readers/LMSequenceReader/SequenceReader.h | 6 +- 10 files changed, 75 insertions(+), 53 deletions(-) diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index e3dbfe8d5759..c0757963f158 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -268,8 +268,8 @@ Parameters = Stabilize (x, enabled=true) = if enabled then [ - beta = Exp(ScalarParam()) - result = Scale(beta, x) + beta = Exp (ScalarParam()) + result = Scale (beta, x) ].result else x ] diff --git a/Source/Common/Include/Config.h b/Source/Common/Include/Config.h index 981fc9c5695c..8d43cbc88e70 100644 --- a/Source/Common/Include/Config.h +++ b/Source/Common/Include/Config.h @@ -18,14 +18,14 @@ using namespace std; namespace Microsoft { namespace MSR { namespace CNTK { #define FUNCTIONOPEN "(" -#define OPENBRACES "[{(\"" -#define CLOSINGBRACES "]})\"" +#define OPENBRACES "[{(\"" // all opening braces +#define CLOSINGBRACES "]})\"" // and matching closing ones static const std::string::size_type npos = (std::string::size_type) -1; // These are the constants associated with the "ResolveVariables" method. 
-static const char* openBraceVar = "$"; -static const char* closingBraceVar = "$"; +static const char* openBraceVar = "$"; // beginning of a var +static const char* closingBraceVar = "$"; // end of a var static const char* forbiddenCharactersInVarName = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \t\n"; static const char* forbiddenCharactersInVarNameEscapeWhitespace = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \\t\\n"; static const std::size_t openBraceVarSize = strlen(openBraceVar); @@ -357,23 +357,19 @@ class ConfigParser // str - string to search // tokenStart - start location in the string to search // returns: character position of matching closing brace, string::npos if no brace present at start position - // BUGBUG: This seems to only work for one kind of braces at a time. Nested other braces are not - // understood. Also, braces in strings are not protected. [fseide] - static std::string::size_type FindBraces(const std::string& str, std::string::size_type tokenStart) + static size_t FindBraces(const std::string& str, const size_t tokenStart) { const auto len = str.length(); // start is outside (or rather, at end of string): no brace here if (tokenStart >= len) - { return npos; - } // open braces and quote - static const std::string openBraces = OPENBRACES; + static const std::string openBraces = OPENBRACES; // currently "[{(\"" // close braces and quote static const std::string closingBraces = CLOSINGBRACES; - const auto charsToLookFor = closingBraces + openBraces; // all chars we match for + static const auto charsToLookFor = closingBraces + openBraces; // all chars we match for // get brace index for first character of input string const auto braceFound = openBraces.find(str[tokenStart]); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 7021f6f66275..c0f8d5e3464c 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -218,7 +218,17 @@ void ComputationNetwork::ReadPersistableParameters(File& fstream, bool create) if (create) // loaded from scratch AddNodeToNet(node); else // reloaded existing - node->Validate(true); // nothing that propagates should have changed --TODO: have a more rigid mechanism to prevent resizing; this should only reload the model parameters + { + let old = node->GetSampleLayout(); + let changed = ValidateNode(node, /*isFinalValidationPass=*/true); + if (changed) + { + let upd = node->GetSampleLayout(); + fprintf(stderr, "ValidateSubNetwork: %ls %ls operation changed, from [%s] to [%s].", node->NodeName().c_str(), node->OperationName().c_str(), + string(old).c_str(), string(upd).c_str()); + //LogicError("ValidateSubNetwork: %ls %ls operation changed during reload or re-validation.", node->NodeName().c_str(), node->OperationName().c_str()); + } + } } fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList"); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 117b0ffc789a..3e44b9c8a93e 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -165,6 +165,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNetwork(); void ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo); + bool ValidateNode(ComputationNodeBasePtr node, bool isFinalValidationPass) const; void MarkValueNonSharableNodes(); private: diff --git 
a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index dc2b224ba356..b813d0ea0371 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -579,6 +579,36 @@ static pair GetDims(const ComputationNodeBasePtr& node) return make_pair(node->GetSampleLayout(), node->HasMBLayout()); } +bool ComputationNetwork::ValidateNode(ComputationNodeBasePtr node, bool isFinalValidationPass) const +{ + const auto& children = node->GetInputs(); + + // keep state + MBLayoutPtr oldMBLayoutPtr = node->GetMBLayout(); + auto dim = GetDims(node); + vector> childDims; + for (auto& child : children) + childDims.push_back(GetDims(child)); + auto sampleLayout = node->GetSampleLayout(); + // We do call validate(final) as many times as needed, since stuff may have changed underneath. + node->Validate(isFinalValidationPass /*final*/); // all nodes have been visited: do verification instead of just inference + // also take the opportunity to propagate m_needsGradient + auto needsGradient = node->m_needsGradient; + for (auto& child : children) // TODO: do we need a check that this is stable if isFinalValidationPass? + node->m_needsGradient |= child->m_needsGradient; + // check state --node will be valid if all nodes have been visited and node has not been updated + bool unchanged = true; + unchanged &= (oldMBLayoutPtr == node->GetMBLayout()); + unchanged &= (dim == GetDims(node)); + vector> newChildDims; + for (auto& child : children) + newChildDims.push_back(GetDims(child)); + unchanged &= (childDims == newChildDims); + unchanged &= (sampleLayout == node->GetSampleLayout()); + unchanged &= (needsGradient == node->m_needsGradient); + return !unchanged; +} + void ComputationNetwork::ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo) { todo = 0; // returns how many nodes are to be redone @@ -596,35 +626,15 @@ void ComputationNetwork::ValidateNodes(list nodes, bool } // if there is not at least one visited child bool valid = false; - if (hasVisitedChild || isLeaf) + if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate() { - // got at least one child: it makes sense to call Validate() - // keep state - MBLayoutPtr oldMBLayoutPtr = node->GetMBLayout(); - auto dim = GetDims(node); - vector> childDims; - for (auto& child : children) - childDims.push_back(GetDims(child)); - auto sampleLayout = node->GetSampleLayout(); - // We do call validate(final) as many times as needed, since stuff may have changed underneath. + // TODO: PrintSelfBeforeValidation() into a function returning a string, and print all in a single line (also when it throws; print & rethrow). node->PrintSelfBeforeValidation(); - node->Validate(isFinalValidationPass /*final*/); // all nodes have been visited: do verification instead of just inference - fprintf(stderr, " -> [%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? " x *" : ""); + bool unchanged = !ValidateNode(node, isFinalValidationPass); node->m_visited = true; - // also take the opportunity to propagate m_needsGradient - auto needsGradient = node->m_needsGradient; - for (auto& child : children) // TODO: do we need a check that this is stable if isFinalValidationPass? 
- node->m_needsGradient |= child->m_needsGradient; - // check state --node will be valid if all nodes have been visited and node has not been updated - bool unchanged = true; - unchanged &= (oldMBLayoutPtr == node->GetMBLayout()); - unchanged &= (dim == GetDims(node)); - vector> newChildDims; - for (auto& child : children) - newChildDims.push_back(GetDims(child)); - unchanged &= (childDims == newChildDims); - unchanged &= (sampleLayout == node->GetSampleLayout()); - unchanged &= (needsGradient == node->m_needsGradient); + fprintf(stderr, "[%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? " x *" : ""); + // print the new type + // sanity checks if (isFinalValidationPass && !unchanged) LogicError("ValidateSubNetwork: %ls %ls operation changed during final validation.", node->NodeName().c_str(), node->OperationName().c_str()); if (isFinalValidationPass && !allChildrenVisited) diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index d67e2be995b9..6bd6c30888cd 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -307,7 +307,7 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl fprintfOrDie(f, "%s", sampleSeparator.c_str()); if (j == jstop) { - fprintf(f, "..."); // 'nuff said + fprintf(f, "... (%d more)", (int)(jend - jstop)); // 'nuff said break; } for (size_t i = 0; i < iend; i++) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 86d310c0379e..474fc7e4be3c 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -167,8 +167,8 @@ struct ComputationNetworkOwnedNodeState // These are public since you are meant to set these flags manually in the debugger or temporarily poke into them from code as needed. bool m_traceNodeValue = false; bool m_traceNodeValueAsCategoryLabel = false; - size_t m_traceNodeValueUpToDim = 5; - size_t m_traceNodeValueUpToT = 5; + size_t m_traceNodeValueUpToDim = 3; // 3 should be enough to see simple patterns such as all values are identical or out of range + size_t m_traceNodeValueUpToT = 8; // 8 time steps fit comfortably into a normal-sized console void EnableNodeTracing(bool isCategoryLabel) { m_traceNodeValue = true; m_traceNodeValueAsCategoryLabel = isCategoryLabel; } protected: // TODO: should be fully encapsulated here @@ -1513,8 +1513,9 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot { if (m_traceNodeValue) { - fprintf(stderr, "Trace --> %ls = %ls -> [%s%s]\n", NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? 
" x *" : ""); - WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, true/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), + const auto shape = GetTensorShape(DetermineElementwiseTensorRank()); + fprintf(stderr, "Trace --> %ls = %ls -> [%s]\n", NodeName().c_str(), OperationName().c_str(), string(shape).c_str()); + WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, false/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/, "%13.10f"/*valueFormatString*/); } diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index d6cebbdcc95e..d9203c1ebeeb 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -167,7 +167,8 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent // BUGBUG: I got an error in when reloading persistent parameterse for a model that had dimension specified as 0, which did not get re-inferred correctly. // We should either simply not write this parameter out at all (since it can always be inferred), or write the tensor shape. - SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here. + if (GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date + SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here. m_delayedValue.Resize(rows, 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag if (modelVersion >= CNTK_MODEL_VERSION_2) diff --git a/Source/Readers/LMSequenceReader/SequenceReader.cpp b/Source/Readers/LMSequenceReader/SequenceReader.cpp index d42831b8ecbb..838f56402fc8 100644 --- a/Source/Readers/LMSequenceReader/SequenceReader.cpp +++ b/Source/Readers/LMSequenceReader/SequenceReader.cpp @@ -1578,7 +1578,7 @@ void BatchSequenceReader::Reset() { mProcessed.clear(); mToProcess.clear(); - mLastProcssedSentenceId = 0; + mLastProcessedSentenceId = 0; mPosInSentence = 0; mLastPosInSentence = 0; mNumRead = 0; @@ -1651,6 +1651,7 @@ void BatchSequenceReader::StartMinibatchLoop(size_t mbSize, size_t epo // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set size_t epochSize = m_epochSize == requestDataSize ? 1000 : m_epochSize; m_epoch = epoch; + m_randomSeed = (unsigned int)m_epoch; m_mbStartSample = epoch * m_epochSize; m_epochSamplesReturned = 0; // counter to know when we returned one epoch @@ -1700,7 +1701,7 @@ size_t BatchSequenceReader::DetermineSequencesToProcess() int mp = (int) mToProcess[s]; if (mProcessed[mp]) { - mLastProcssedSentenceId = mp; + mLastProcessedSentenceId = mp; mLastPosInSentence = 0; allDone = true; break; @@ -1722,7 +1723,7 @@ size_t BatchSequenceReader::DetermineSequencesToProcess() size_t maxToProcess = mRequestedNumParallelSequences > 0 ? mRequestedNumParallelSequences : SIZE_MAX; // if mRequestedNumParallelSequences is 0 then we go by MB size size_t maxTokens = mRequestedNumParallelSequences > 0 ? 
SIZE_MAX : m_mbSize; size_t numTokens = 0; // token counter - for (size_t seq = mLastProcssedSentenceId; + for (size_t seq = mLastProcessedSentenceId; seq < mNumRead && // hit end of buffer mToProcess.size() < maxToProcess; // hit parallel-sequence limit seq++) @@ -1791,14 +1792,14 @@ bool BatchSequenceReader::GetMinibatchData(size_t& /*out*/ firstPosInS #ifdef _MSC_VER // make some old configurations reproducable (m_cacheBlockSize used to be a constant) --TODO: remove in a few months if (m_cacheBlockSize == 50000) { + srand(++m_randomSeed); // TODO: older code did not have that; so no idea what random seed was used std::random_shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end()); // Note: random_shuffle is deprecated since C++14. } else // new configs use a wider randomization #endif { - std::random_device rd; - std::mt19937 g(rd()); + std::mt19937 g(++m_randomSeed); // random seed is initialized to epoch, but gets incremented for intermediate reshuffles std::shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end(), g); } diff --git a/Source/Readers/LMSequenceReader/SequenceReader.h b/Source/Readers/LMSequenceReader/SequenceReader.h index cac0a9bd957d..4940edcd0ddb 100644 --- a/Source/Readers/LMSequenceReader/SequenceReader.h +++ b/Source/Readers/LMSequenceReader/SequenceReader.h @@ -354,7 +354,9 @@ class BatchSequenceReader : public SequenceReader using Base::mRequestedNumParallelSequences; // IDataReader private: - size_t mLastProcssedSentenceId; + unsigned int m_randomSeed = 0; // deterministic random seed + + size_t mLastProcessedSentenceId; size_t mNumRead; // number of sentences in current cache block vector mProcessed; // [mNumRead] true if sequence has already been returned in this cache block @@ -379,7 +381,7 @@ class BatchSequenceReader : public SequenceReader BatchSequenceReader() : m_pMBLayout(make_shared()) { - mLastProcssedSentenceId = 0; + mLastProcessedSentenceId = 0; mRequestedNumParallelSequences = 1; mLastPosInSentence = 0; mNumRead = 0; From 5783c61a81d448736a509ce953ec662ce7707f32 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 10 Mar 2016 08:13:49 -0800 Subject: [PATCH 04/26] added missing optional imageLayout parameter to NDL of Reshape() --- Source/CNTK/SynchronousExecutionEngine.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/CNTK/SynchronousExecutionEngine.cpp b/Source/CNTK/SynchronousExecutionEngine.cpp index 01d78cb6835e..9e2431e895df 100644 --- a/Source/CNTK/SynchronousExecutionEngine.cpp +++ b/Source/CNTK/SynchronousExecutionEngine.cpp @@ -252,8 +252,9 @@ void SynchronousNodeEvaluator::Evaluate(NDLNode* node, const size_t img_width = node->GetOptionalParameter("imageWidth", "0"); size_t img_height = node->GetOptionalParameter("imageHeight", "0"); size_t img_channels = node->GetOptionalParameter("imageChannels", "0"); + ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC")); - nodePtr = builder.LegacyReshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, ImageLayoutKind::HWC /*legacy*/), name); // BUGBUG: use a tensor descriptor instead + nodePtr = builder.LegacyReshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, imageLayoutKind), name); } } else if (cnNodeType == OperationNameOf(PastValueNode) || From 7b9558a18ad37179dbc7268e75cb1dd852bc8e56 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 10 Mar 2016 08:18:25 
-0800 Subject: [PATCH 05/26] (bug fix: NDL Reshape should only allow 2 positional parameters) --- Source/CNTK/SynchronousExecutionEngine.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/CNTK/SynchronousExecutionEngine.cpp b/Source/CNTK/SynchronousExecutionEngine.cpp index 9e2431e895df..18fe4a6c9a21 100644 --- a/Source/CNTK/SynchronousExecutionEngine.cpp +++ b/Source/CNTK/SynchronousExecutionEngine.cpp @@ -206,7 +206,7 @@ void SynchronousNodeEvaluator::Evaluate(NDLNode* node, const else if (cnNodeType == OperationNameOf(RowRepeatNode)) { if (parameter.size() != 2) - RuntimeError("RowRepeat should have two parameters. Usage: RowRepeat(origNodeName, numRepeats."); + RuntimeError("RowRepeat should have two parameters. Usage: RowRepeat(origNodeName, numRepeats)."); nodeParamCount = 1; nodeParamStart = 0; @@ -238,8 +238,8 @@ void SynchronousNodeEvaluator::Evaluate(NDLNode* node, const } else if (cnNodeType == L"Reshape" /*OperationNameOf(ReshapeNode)*/) { - if (parameter.size() < 2 || parameter.size() > 5) - RuntimeError("Reshape should have two to five parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=]."); + if (parameter.size() != 2) + RuntimeError("Reshape should have two parameters. Usage: Reshape(origNodeName, numRows, [imageWidth=], [imageHeight=], [imageChannels=]."); nodeParamCount = 1; nodeParamStart = 0; From e64aec5ce27e9bda6d2a9cf1310841ffe48ed05d Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 10 Mar 2016 11:10:00 -0800 Subject: [PATCH 06/26] refined validation output --- .../lyx/CNTKBook_CNTK_Programmer_Chapter.lyx | 5 -- .../ComputationNetwork.h | 2 +- .../ComputationNetworkEvaluation.cpp | 33 +++++++--- .../ComputationNetworkLib/ComputationNode.cpp | 63 +++++++++++++++++++ .../ComputationNetworkLib/ComputationNode.h | 49 +++------------ Source/ComputationNetworkLib/ReshapingNodes.h | 22 ++----- 6 files changed, 101 insertions(+), 73 deletions(-) diff --git a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Programmer_Chapter.lyx b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Programmer_Chapter.lyx index be90521b3176..0d3994eacfe2 100644 --- a/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Programmer_Chapter.lyx +++ b/Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Programmer_Chapter.lyx @@ -3134,11 +3134,6 @@ virtual void Validate() { \end_layout -\begin_layout Plain Layout - - PrintSelfBeforeValidation(); -\end_layout - \begin_layout Plain Layout diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 3e44b9c8a93e..074061ad688c 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -164,7 +164,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb private: void ValidateNetwork(); - void ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo); + size_t ValidateNodes(list nodes, bool isFirstPass, bool isFinalValidationPass); bool ValidateNode(ComputationNodeBasePtr node, bool isFinalValidationPass) const; void MarkValueNonSharableNodes(); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index b813d0ea0371..8615dce8edd7 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -527,16 +527,16 @@ void ComputationNetwork::ValidateNetwork() // Keep 
going through the list until all nodes have been validated and all inputs have been validated as well. // - validate (final) // final means consistency checks // Fail if any change during this stage. - size_t pass = 0; + size_t pass = 1; size_t toValidate = nodes.size(); while (toValidate > 0) { - pass++; fprintf(stderr, "\n\nValidating network. %d nodes to process in pass %d.\n", (int) toValidate, (int) pass); - ValidateNodes(nodes, false /*isFinalValidationPass*/, toValidate); + toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, false /*isFinalValidationPass*/); + pass++; } fprintf(stderr, "\n\nValidating network, final pass.\n"); - ValidateNodes(nodes, true /*isFinalValidationPass*/, toValidate); + toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, true /*isFinalValidationPass*/); if (toValidate != 0) LogicError("ValidateSubNetwork: ValidateNodes(true) unexpectedly returned with work left to do."); @@ -609,9 +609,11 @@ bool ComputationNetwork::ValidateNode(ComputationNodeBasePtr node, bool isFinalV return !unchanged; } -void ComputationNetwork::ValidateNodes(list nodes, bool isFinalValidationPass, size_t& todo) +// perform one pass of validation over the topologically-sorted node set +// returns how many nodes either could not yet be validated yet or have changed and thus must be redone +size_t ComputationNetwork::ValidateNodes(list nodes, bool isFirstPass, bool isFinalValidationPass) { - todo = 0; // returns how many nodes are to be redone + size_t todo = 0; for (auto& node : nodes) { const auto& children = node->GetInputs(); @@ -628,11 +630,21 @@ void ComputationNetwork::ValidateNodes(list nodes, bool bool valid = false; if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate() { - // TODO: PrintSelfBeforeValidation() into a function returning a string, and print all in a single line (also when it throws; print & rethrow). - node->PrintSelfBeforeValidation(); - bool unchanged = !ValidateNode(node, isFinalValidationPass); + string prevPrototype = node->FormatOperationPrototype(""); + bool unchanged; + try + { + unchanged = !ValidateNode(node, isFinalValidationPass); + string updatedPrototype = node->FormatOperationPrototype(""); + if (isFirstPass || !unchanged || prevPrototype != updatedPrototype) + fprintf(stderr, "Validating --> %s\n", updatedPrototype.c_str()); + } + catch (...) // if validation failed then print the prototype anyway so one can see the input args + { + fprintf(stderr, "Validating --> %s FAILED\n", prevPrototype.c_str()); + throw; + } node->m_visited = true; - fprintf(stderr, "[%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? 
" x *" : ""); // print the new type // sanity checks if (isFinalValidationPass && !unchanged) @@ -646,6 +658,7 @@ void ComputationNetwork::ValidateNodes(list nodes, bool if (!valid) todo++; } + return todo; } // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index 6bd6c30888cd..387ee95969fa 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -208,6 +208,69 @@ TensorShape ComputationNodeBase::GetTensorSliceFor(size_t rank, const FrameRange // others // ----------------------------------------------------------------------- +/*virtual*/ string ComputationNodeBase::FormatOperationPrototype(const string& extraArgs) const +{ + string prototype; + prototype += msra::strfun::strprintf("%ls = %ls", NodeName().c_str(), OperationName().c_str()); + + // arguments of operation + if (IsLeaf()) + prototype += "()"; + else + { + prototype += " ("; + for (size_t i = 0; i < GetNumInputs(); i++) + { + const auto& child = m_inputs[i]; + if (i > 0) + prototype += ", "; + + if (child) + prototype += msra::strfun::strprintf("%ls", child->NodeName().c_str()); + else + prototype += "NULL"; + } + prototype += extraArgs; + prototype += ")"; + } + + // type (tensor dimensions) of operation + prototype += " : "; + + if (!IsLeaf()) + { + //prototype += "("; + for (size_t i = 0; i < GetNumInputs(); i++) + { + const auto& child = m_inputs[i]; + if (i > 0) + prototype += ", "; + + if (child == nullptr) + { + prototype += "NULL"; + continue; + } + + const char* mbSizeMark = child->m_pMBLayout ? " x *" : ""; +#if 0 + if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation + prototype += msra::strfun::strprintf("%ls[%s%s {W=%lu, H=%lu, C=%lu}]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark, + child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0]); + // BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct. + else +#endif + prototype += msra::strfun::strprintf("[%s%s]", string(child->m_sampleLayout).c_str(), mbSizeMark); + } + prototype += extraArgs; + //prototype += ")"; + } + + prototype += msra::strfun::strprintf(" -> [%s%s]", string(GetSampleLayout()).c_str(), HasMBLayout() ? 
" x *" : ""); + + return prototype; +} + template /*virtual*/ void ComputationNode::DumpNodeInfo(const bool /*printValues*/, const bool printMetadata, File& fstream) const { diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 474fc7e4be3c..07a1928633b6 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -108,7 +108,7 @@ struct /*interface*/ IComputationNode // --- optional overrides for more informative logging - virtual void PrintSelfBeforeValidation() const = 0; // called in validation loop right before Validate() + virtual std::string FormatOperationPrototype(const std::string& extraArgs) const = 0; // format the operation into a "prototype" (listing dimensions and parameters) virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const = 0; protected: @@ -592,7 +592,7 @@ protected: public: // ...the following should be protected, but nodes inquire ab /*HasName::*/ void SetName(const std::wstring& newName) override // also for use by ExperimentalNetworkBuilder { m_nodeName = newName; - fprintf(stderr, "Node --> %ls = %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr); + //fprintf(stderr, "Node --> %ls : %ls\n", NodeName().c_str(), OperationName().c_str()), fflush(stderr); } bool NeedsGradient() const { return m_needsGradient; } @@ -786,36 +786,7 @@ protected: public: // ...the following should be protected, but nodes inquire ab virtual void PrintSelf(bool printMatrices = false) const = 0; // called in validation loop right before Validate() - virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const - { - fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str()); - - if (!IsLeaf()) - { - fprintf(stderr, "("); - for (size_t i = 0; i < GetNumInputs(); i++) - { - const auto& child = m_inputs[i]; - if (i > 0) - fprintf(stderr, ", "); - - if (child == nullptr) - { - fprintf(stderr, "NULL"); - continue; - } - - const char* mbSizeMark = child->m_pMBLayout ? " x *" : ""; - if (child->m_sampleLayout.GetRank() == 3 && (child->m_sampleLayout[1] != 1 || child->m_sampleLayout[0] != 1)) // looks like an image: use WHC notation - fprintf(stderr, "%ls[%s%s {W=%lu, H=%lu, C=%lu}]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark, - child->m_sampleLayout[1], child->m_sampleLayout[2], child->m_sampleLayout[0]); - // BUGBUG: This ^^ will print based on the old legacy layout, and we have no way of knowing here whether that is correct. 
- else - fprintf(stderr, "%ls[%s%s]", child->NodeName().c_str(), string(child->m_sampleLayout).c_str(), mbSizeMark); - } - fprintf(stderr, ")"); - } - } + virtual std::string /*IComputationNode::*/ FormatOperationPrototype(const std::string& extraArgs) const; // helper for topology plot: enumerate arcs that can be reached starting from the current node's children typedef std::pair ComputationArc; @@ -1300,11 +1271,11 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot VerifyDataSize(Value()); } -#ifdef _DEBUG // NaN checks virtual void /*IComputationNode::*/ EndForwardProp() override { Base::EndForwardProp(); +#ifdef _DEBUG #ifdef TRACK_GAP_NANS MaskMissingValueColumnsToZero(FrameRange(m_pMBLayout)); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0 if (Value().HasNan("EndForwardProp")) @@ -1315,9 +1286,10 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot Value().Print(msra::strfun::utf8(NodeName()), 0, min(Value().GetNumRows()-1, 4), 0, min(Value().GetNumCols()-1, 4)); #endif InvalidateMissingValueColumns(FrameRange(m_pMBLayout)); // blast NaNs into columns that are gaps in a packed layout +#endif + // tracing Trace(); } -#endif #if 0 // (keep it around in case we need to add stuff in the future) virtual void /*IComputationNode::*/BeginBackprop() override @@ -1511,10 +1483,9 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot const std::string& valueFormatString) const; void Trace() { - if (m_traceNodeValue) + if (m_traceNodeValue+1) { - const auto shape = GetTensorShape(DetermineElementwiseTensorRank()); - fprintf(stderr, "Trace --> %ls = %ls -> [%s]\n", NodeName().c_str(), OperationName().c_str(), string(shape).c_str()); + fprintf(stderr, "Trace --> %s\n", FormatOperationPrototype("").c_str()); WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, false/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/, "%13.10f"/*valueFormatString*/); @@ -1720,7 +1691,7 @@ class FlowControlNode : public ComputationNodeBase virtual std::wstring ToString(void) const override { NOT_IMPLEMENTED; } // these are meant to be called during computation, so provide dummy implementations virtual bool RequiresPreCompute() const override { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features. 
- virtual void PrintSelfBeforeValidation() const override { } + virtual std::string FormatOperationPrototype(const std::string& extraArgs) const override { return ""; } virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const override {} protected: public: // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of SEQTraversalFlowControlNode @@ -1847,7 +1818,7 @@ protected: using Base::MarkValueNonSharable; \ using Base::OutputUsedInComputingInputNodesGradients; \ using Base::PrintNodeValuesToFile; \ - using Base::PrintSelfBeforeValidation; \ + using Base::FormatOperationPrototype; \ using Base::ReleaseMatricesAfterBackprop; \ using Base::ReleaseMatricesAfterForwardProp; \ using Base::ReleaseMatrixToPool; \ diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 3c5adb033377..f5f60309776c 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -533,10 +533,9 @@ class RowRepeatNode : public ComputationNode, public NumInputs<1> fstream >> m_numRepeat; } - virtual void PrintSelfBeforeValidation() const override + virtual std::string FormatOperationPrototype(const std::string& extraArgs) const override { - Base::PrintSelfBeforeValidation(); - fprintf(stderr, ", numRepeats=%lu", m_numRepeat); + return Base::FormatOperationPrototype(extraArgs + msra::strfun::strprintf(", numRepeats=%lu", m_numRepeat)); } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override @@ -847,22 +846,9 @@ class LegacyReshapeNode : public ReinterpretNodeBase m_targetImageLayout.Save(fstream); } - virtual void /*IComputationNode::*/ PrintSelfBeforeValidation() const override + virtual std::string /*IComputationNode::*/ FormatOperationPrototype(const std::string& extraArgs) const override { - fprintf(stderr, "\nValidating --> %ls = %ls", NodeName().c_str(), OperationName().c_str()); - fprintf(stderr, "("); - for (size_t i = 0; i < GetNumInputs(); i++) - { - ComputationNodePtr child = Input(i); - if (i > 0) - fprintf(stderr, ", "); - if (!child) - fprintf(stderr, "NULL"); - else - fprintf(stderr, "%ls[%s%s]", child->NodeName().c_str(), string(child->GetSampleLayout()).c_str(), child->HasMBLayout() ? " x *" : ""); - } - fprintf(stderr, ", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0]); - // BUGBUG: This interpretaion as image dims is only correct for the 'legacy format, not for cudnn. + return Base::FormatOperationPrototype(extraArgs + msra::strfun::strprintf(", NumOfRows=%lu, imageWidth=%lu, imageHeight=%lu, imageChannels=%lu)", m_numTargetRows, m_targetImageLayout[1], m_targetImageLayout[2], m_targetImageLayout[0])); } // TODO: Clarify/resolve the semantic overlap between BeginForwardProp() and UpdateFunctionMBSize(). 
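The tracing introduced in PATCH 02 and refined in PATCH 06 above is meant to be driven either through the SGD options traceNodeNamesReal/traceNodeNamesCategory, or by hand, since the flags are deliberately public ("you are meant to set these flags manually in the debugger or temporarily poke into them from code as needed"). A minimal sketch of the by-hand route follows; the node name "z" and the point of invocation are placeholders for illustration, not part of these patches:

    // hedged sketch: enable value tracing on one node after the network is built
    auto node = net->GetNodeFromName(L"z");              // any node of interest ("z" is a placeholder)
    node->EnableNodeTracing(/*isCategoryLabel=*/false);  // sets m_traceNodeValue; values print as real numbers
    node->m_traceNodeValueUpToDim = 3;                   // print only the first 3 values per sample
    node->m_traceNodeValueUpToT = 8;                     // and only the first 8 time steps
    // from now on, each EndForwardProp() on this node calls Trace(), which prints a header
    // via FormatOperationPrototype() and the values via WriteMinibatchWithFormatting()

The equivalent config route, as wired up in SGD.cpp in PATCH 02, is traceNodeNamesReal = z inside the SGD block (see the commented-out traceNodeNamesReal line added to rnn.cntk in the next patch).
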
From 52929f9f265443ab51ced9f89885c7c862d73267 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 11 Mar 2016 06:56:38 -0800 Subject: [PATCH 07/26] cleaned up log output --- Examples/Text/PennTreebank/Config/rnn.cntk | 11 ++++++----- Source/CNTK/BrainScript/BrainScriptEvaluator.cpp | 2 +- Source/CNTK/BrainScript/BrainScriptParser.cpp | 6 +++--- Source/CNTK/CNTK.cpp | 16 +++++++++------- .../ComputationNetworkEvaluation.cpp | 6 +++--- .../ComputationNetworkScripting.cpp | 4 ---- Source/ComputationNetworkLib/ComputationNode.h | 2 +- Source/SGDLib/SGD.cpp | 16 ++++++++-------- 8 files changed, 31 insertions(+), 32 deletions(-) diff --git a/Examples/Text/PennTreebank/Config/rnn.cntk b/Examples/Text/PennTreebank/Config/rnn.cntk index d7bcf156b059..c158d39e2ac7 100644 --- a/Examples/Text/PennTreebank/Config/rnn.cntk +++ b/Examples/Text/PennTreebank/Config/rnn.cntk @@ -54,11 +54,8 @@ writeWordAndClassInfo = [ train = [ action = "train" - minibatchSize = 128:256:512 # TODO: Why is this here and not inside SGD? traceLevel = 1 epochSize = 0 # (for quick tests, this can be overridden with something small) - defaultHiddenActivity = 0.1 # default value for hidden states--is this used by SimpleNetworkBuilder? - useValidation = true SimpleNetworkBuilder = [ rnnType = "CLASSLSTM" # TODO: camelCase @@ -70,6 +67,7 @@ train = [ initValueScale = 6.0 uniformInit = true layerSizes = "$confVocabSize$:150:200:10000" + defaultHiddenActivity = 0.1 # default value for hidden states addPrior = false addDropoutNodes = false applyMeanVarNorm = false @@ -81,6 +79,7 @@ train = [ ] SGD = [ + minibatchSize = 128:256:512 learningRatesPerSample = 0.1 momentumPerMB = 0 gradientClippingWithTruncation = true @@ -90,6 +89,10 @@ train = [ gradUpdateType = "none" loadBestModel = true + dropoutRate = 0.0 + + #traceNodeNamesReal = AutoName37 # this allows to track a node's value + # settings for Auto Adjust Learning Rate AutoAdjust = [ autoAdjustLR = "adjustAfterEpoch" @@ -102,8 +105,6 @@ train = [ numPrevLearnRates = 5 numBestSearchEpoch = 1 ] - - dropoutRate = 0.0 ] reader = [ diff --git a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp index a8b2904d30a7..d333c1c679eb 100644 --- a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp +++ b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp @@ -53,7 +53,7 @@ class EvaluationException : public ConfigException __declspec_noreturn static inline void EvaluationError(const wstring &msg, TextLocation where) { - Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); + //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw EvaluationException(msg, where); } diff --git a/Source/CNTK/BrainScript/BrainScriptParser.cpp b/Source/CNTK/BrainScript/BrainScriptParser.cpp index 834fe04334a2..dab31e32b782 100644 --- a/Source/CNTK/BrainScript/BrainScriptParser.cpp +++ b/Source/CNTK/BrainScript/BrainScriptParser.cpp @@ -184,7 +184,7 @@ class CodeSource __declspec_noreturn static void Fail(wstring msg, TextLocation where) { - Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); + //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw CodeSourceException(msg, where); } @@ -375,7 +375,7 @@ class Lexer : public CodeSource private: __declspec_noreturn static void Fail(wstring msg, Token where) { - Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); + //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw LexerException(msg, where.beginLocation); } @@ -606,7 +606,7 @@ class Parser : public Lexer __declspec_noreturn static void Fail(const wstring& msg, 
Token where) { - Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); + //Microsoft::MSR::CNTK::DebugUtil::PrintCallStack(); throw ParseException(msg, where.beginLocation); } diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index 3db4a6cb5d2f..c35ee681ca53 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -579,11 +579,13 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is std::string timestamp = TimeDateStamp(); // dump config info - fprintf(stderr, "running on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); - fprintf(stderr, "command line: \n"); + fprintf(stderr, "\nRunning on %s at %s\n", GetHostName().c_str(), timestamp.c_str()); + fprintf(stderr, "Command line: \n"); for (int i = 0; i < argc; i++) fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability + fprintf(stderr, "\n\n"); +#if 1 //def _DEBUG // This simply merges all the different config parameters specified (eg, via config files or via command line directly), // and prints it. fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n"); @@ -601,12 +603,12 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n"); config.dumpWithResolvedVariables(); fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n"); +#endif - fprintf(stderr, "Commands: "); + fprintf(stderr, "Commands:"); for (int i = 0; i < command.size(); i++) - { - fprintf(stderr, "%s ", command[i].c_str()); - } + fprintf(stderr, " %s", command[i].c_str()); + fprintf(stderr, "\n"); // run commands std::string type = config(L"precision", "float"); @@ -614,7 +616,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is if (config.Exists("type")) InvalidArgument("CNTK: Use of 'type' parameter is deprecated, it is called 'precision' now."); - fprintf(stderr, "\nPrecision = \"%s\"\n", type.c_str()); + fprintf(stderr, "Precision = \"%s\"\n", type.c_str()); if (type == "float") DoCommands(config); else if (type == "double") diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 8615dce8edd7..957c257da5fa 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -430,7 +430,7 @@ void ComputationNetwork::CompileNetwork() // STEP: Some final details. ResetEvalTimeStamps(); // invalidate all m_value fields. Really belongs into StartEvaluateMinibatchLoop() - fprintf(stderr, "\nPost-processing network complete.\n"); + fprintf(stderr, "\nPost-processing network complete.\n\n"); m_isCompiled = true; } @@ -531,11 +531,11 @@ void ComputationNetwork::ValidateNetwork() size_t toValidate = nodes.size(); while (toValidate > 0) { - fprintf(stderr, "\n\nValidating network. %d nodes to process in pass %d.\n", (int) toValidate, (int) pass); + fprintf(stderr, "\nValidating network. 
%d nodes to process in pass %d.\n\n", (int) toValidate, (int) pass); toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, false /*isFinalValidationPass*/); pass++; } - fprintf(stderr, "\n\nValidating network, final pass.\n"); + fprintf(stderr, "\nValidating network, final pass.\n\n"); toValidate = ValidateNodes(nodes, /*isFirstPass=*/pass == 1, true /*isFinalValidationPass*/); if (toValidate != 0) LogicError("ValidateSubNetwork: ValidateNodes(true) unexpectedly returned with work left to do."); diff --git a/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp b/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp index acc3d004ab2d..e7f1be1803a9 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp @@ -107,10 +107,6 @@ ComputationNetwork::ComputationNetwork(const IConfigRecordPtr configp) // perform all necessary post-processing CompileNetwork(); -#if 1 - wstring args = ToString(); - fprintf(stderr, "%ls\n", args.c_str()); -#endif } // =================================================================== diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 07a1928633b6..2f859c1d6495 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -1483,7 +1483,7 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot const std::string& valueFormatString) const; void Trace() { - if (m_traceNodeValue+1) + if (m_traceNodeValue) { fprintf(stderr, "Trace --> %s\n", FormatOperationPrototype("").c_str()); WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, false/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector(), diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 7c725a1e20ca..82404e2f2b5b 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -49,21 +49,21 @@ void SGD::Train(function createN } wstring modelFileName = GetModelNameForEpoch(int(startEpoch) - 1); - bool loadNetworkFromCheckpoint = false; - if (startEpoch >= 0) - { - loadNetworkFromCheckpoint = true; - fprintf(stderr, "Starting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str()); - } + bool loadNetworkFromCheckpoint = startEpoch >= 0; + if (loadNetworkFromCheckpoint) + fprintf(stderr, "\nStarting from checkpoint. Loading network from '%ls'.\n", modelFileName.c_str()); + else + fprintf(stderr, "\nCreating virgin network.\n"); // create or load from checkpoint shared_ptr net = !loadNetworkFromCheckpoint ? createNetworkFn(deviceId) : ComputationNetwork::CreateFromFile(deviceId, modelFileName); // log the device we are computing on + fprintf(stderr, "%s model with %d nodes", loadNetworkFromCheckpoint ? "Loaded" : "Created", (int)net->GetTotalNumberOfNodes()); if (net->GetDeviceId() < 0) - fprintf(stderr, "\nSGD using CPU.\n"); + fprintf(stderr, " on CPU.\n"); else - fprintf(stderr, "\nSGD using GPU %d.\n", (int) net->GetDeviceId()); + fprintf(stderr, " on GPU %d.\n", (int) net->GetDeviceId()); // TODO: BUGBUG: if not starting from checkpoint, need to synchronize initial model // strategy should be to run the initializer above on mpiRank==0, and then broadcast parameters. 
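One detail in the command-line echo of this patch is easy to misread: the %*s conversion takes its field width from the argument list, so i > 0 ? 2 : 0 left-pads every argument after the first with two spaces and avoids a trailing separator. A stand-alone demo of just that idiom (plain C stdio with narrow argv, nothing CNTK-specific):

    #include <cstdio>

    int main(int argc, char* argv[])
    {
        for (int i = 0; i < argc; i++)
            fprintf(stderr, "%*s%s", i > 0 ? 2 : 0, "", argv[i]); // '*' pulls the pad width from the arguments
        fprintf(stderr, "\n");
        return 0;
    }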
From 63c50f6364a72c2bd3ea9d0c35fc186ec6991641 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 11 Mar 2016 16:57:14 -0800 Subject: [PATCH 08/26] added new node Where (only BS) --- Makefile | 1 + .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 1 + Source/Common/Include/Sequences.h | 16 ++ .../ComputationNetworkBuilder.cpp | 1 + .../ComputationNetworkLib.vcxproj | 3 +- .../ComputationNetworkLib.vcxproj.filters | 3 + .../ComputationNetworkLib/ComputationNode.h | 1 + .../LinearAlgebraNodes.h | 37 +--- .../ComputationNetworkLib/NonlinearityNodes.h | 16 +- Source/ComputationNetworkLib/RecurrentNodes.h | 16 +- .../ComputationNetworkLib/ReshapingNodes.cpp | 175 ++++++++++++++++++ Source/ComputationNetworkLib/ReshapingNodes.h | 60 +++--- .../SpecialPurposeNodes.h | 16 +- Source/ComputationNetworkLib/TrainingNodes.h | 16 +- Source/Math/CPUMatrix.cpp | 13 +- Source/Math/Matrix.h | 45 ++--- 16 files changed, 269 insertions(+), 151 deletions(-) create mode 100644 Source/ComputationNetworkLib/ReshapingNodes.cpp diff --git a/Makefile b/Makefile index 69cdbf3654e2..4aeac0b21b35 100644 --- a/Makefile +++ b/Makefile @@ -529,6 +529,7 @@ CNTK_SRC =\ $(SOURCEDIR)/CNTK/SynchronousExecutionEngine.cpp \ $(SOURCEDIR)/CNTK/tests.cpp \ $(SOURCEDIR)/ComputationNetworkLib/ComputationNode.cpp \ + $(SOURCEDIR)/ComputationNetworkLib/ReshapingNodes.cpp \ $(SOURCEDIR)/ComputationNetworkLib/ComputationNetwork.cpp \ $(SOURCEDIR)/ComputationNetworkLib/ComputationNetworkEvaluation.cpp \ $(SOURCEDIR)/ComputationNetworkLib/ComputationNetworkAnalysis.cpp \ diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index c0757963f158..97498816251d 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -98,6 +98,7 @@ SumElements(matrix, tag='') = new ComputationNode [ operation = 'SumElements' ; Tanh(z, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = z /*plus the function args*/ ] TimeReverse(vectorSequence, tag='') = new ComputationNode [ operation = 'TimeReverse' ; inputs = vectorSequence /*plus the function args*/ ] TransposeTimes(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'TransposeTimes' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ] +Where(cond, tag='') = new ComputationNode [ operation = 'Where' ; inputs = cond /*plus the function args*/ ] ############################################################################## # common macros diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 286c9ac4f3f6..98f98aa51866 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -364,6 +364,22 @@ struct MBLayout return false; } + // ------------------------------------------------------------------- + // indexing + // ------------------------------------------------------------------- + + // get the matrix-column index for a given time step in a given sequence + size_t GetColumnIndex(const SequenceInfo& seq, size_t t) const + { + if (t > seq.GetNumTimeSteps()) + LogicError("GetColumnIndex: t out of sequence bounds."); + ptrdiff_t tIn = (ptrdiff_t)t + seq.tBegin; + if (tIn < 0 || (size_t)tIn >= GetNumTimeSteps()) + LogicError("GetColumnIndex: Attempted to access a time step that is accessing a portion of a sequence that is not included in current minibatch."); // we may encounter this for truncated BPTT + size_t col = (size_t)tIn * GetNumParallelSequences() + seq.s; + return (size_t)col; + 
} + private: // we are trying to access content--this verifies that the structure is consistent // All frames must now be declared. diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index 35f8ce459712..a623d8da72bd 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -95,6 +95,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(TimesNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(TransposeDimensionsNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(TransposeTimesNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(WhereNode)) return New>(forward<_Types>(_Args)...); // legacy names we also support for back compat of model-files else if (nodeType == L"ColumnElementTimes") return New>(forward<_Types>(_Args)...); else if (nodeType == L"Delay") return New>(forward<_Types>(_Args)...); diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj index 79200885a378..2c08f30899a9 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj @@ -170,8 +170,9 @@ + - + \ No newline at end of file diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters index b6cac5de7e22..ac39ae84bf79 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters @@ -37,6 +37,9 @@ Network + + Nodes + diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 2f859c1d6495..b42e1098ca17 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -276,6 +276,7 @@ class ComputationNodeBase : public IComputationNode, ComputationNodeBase(DEVICEID_TYPE deviceId, const wstring& name) : m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_learningRateMultiplier(0), m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name) { + // TODO: should m_learningRateMultiplier be set to 0? Or should every node have a way to add its own say on the learning rate for all its inputs? 
} virtual ~ComputationNodeBase() { diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index 064e1bf806a3..bf4b9f22e461 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -137,20 +137,8 @@ class NegateNode : public ComputationNode, public NumInputs<1> Input(0)->GradientFor(fr) -= GradientFor(fr); } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The NegateNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The NegateNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { @@ -460,10 +448,7 @@ class ElementTimesNode : public BinaryElementWiseNode inputGradient.AddElementwiseProductOf(gradient, otherInputValue); } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - return true; - } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return true; } virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { @@ -689,20 +674,8 @@ class SumColumnElementsNode : public ComputationNode, public NumInputs sliceInputGrad += sliceOutputGrad; // here the assumption is that sliceOutputGrad is a row vector } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The SumColumnElementsNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The SumColumnElementsNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h index 983ea594630b..a2cd5421d631 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -230,13 +230,7 @@ class SoftmaxNode : public SoftmaxNodeBase { } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The plus node does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } /*virtual*/ void BackpropToV(Matrix& gradient, const Matrix& inputFunctionValues, Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& functionValues) { @@ -304,13 +298,7 @@ class LogSoftmaxNode : public SoftmaxNodeBase { } - 
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The plus node does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } /*virtual*/ void BackpropToV(Matrix& gradient, const Matrix& inputFunctionValues, Matrix& inputGradientValues, const Matrix& gradientValues, const Matrix& functionValues) { diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index d9203c1ebeeb..feb7e485a20b 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -232,20 +232,8 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent } } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The DelayedValueNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The DelayedValueNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void EndForwardProp() override // called after last iteration step of ForwardProp() { diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp new file mode 100644 index 000000000000..dc1734a44987 --- /dev/null +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -0,0 +1,175 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
+//
+// ReshapingNodes.cpp -- collection of nodes that reshape or sub-sample matrices leading to layout changes
+//
+
+#include "Basics.h"
+#include "ReshapingNodes.h"
+#include "Matrix.h"
+#include "ComputationNode.h"
+#include "Sequences.h"
+
+#include <unordered_set>
+#include <map>
+#include <string>
+#include <vector>
+#include <stdexcept>
+#include <list>
+#include <memory>
+#include <algorithm>
+#include <assert.h>
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+// -----------------------------------------------------------------------
+// Where(bitVector) -- extract indices of non-0 values in a sequence
+// -----------------------------------------------------------------------
+
+// TODO: move to MBLayout as a static method
+// packing algorithm
+//  - width: maximum width of structure; set to maximum over sequence lengths
+//  - inputSequences: vector of input SequenceInfo records (only seqId and GetNumTimeSteps() are used)
+//  - [out] *pMBLayout: MBLayout that describes the created packed sequence set
+//  - placement, rowAllocations: temp buffers (passed in to be able to optimize memory allocations)
+template <class SequenceInfoVector>
+static void PackSequences(const SequenceInfoVector& inputSequences,
+                          /*ref->out*/MBLayoutPtr pMBLayout,
+                          /*temp buffer*/std::vector<std::pair<size_t, size_t>>& placement,
+                          /*temp buffer*/std::vector<size_t> rowAllocations)
+{
+    placement.resize(inputSequences.size()); // [sequence index] result goes here (entries are invalid for gaps)
+    // determine width of MBLayout
+    size_t width = 0;
+    for (size_t i = 0; i < inputSequences.size(); i++)
+        if (inputSequences[i].seqId == GAP_SEQUENCE_ID)
+            continue;
+        else if (width < inputSequences[i].GetNumTimeSteps())
+            width = inputSequences[i].GetNumTimeSteps();
+    // allocate
+    rowAllocations.clear(); // [row] we build rows one by one
+    for (size_t i = 0; i < inputSequences.size(); i++)
+    {
+        if (inputSequences[i].seqId == GAP_SEQUENCE_ID)
+            continue;
+        let len = inputSequences[i].GetNumTimeSteps();
+        // first see if we find a row that has enough space
+        size_t s;
+        for (s = 0; s < rowAllocations.size(); s++)
+            if (rowAllocations[s] + len <= width)
+                break; // yep, it fits
+        // if we did not find an s that fits, create a new one
+        if (s == rowAllocations.size())
+            rowAllocations.push_back(0);
+        // sequence goes to (s, rowAllocations[s])
+        placement[i] = make_pair(s, rowAllocations[s]);
+        // and allocate it
+        rowAllocations[s] += len;
+    }
+    // create MBLayout
+    pMBLayout->Init(rowAllocations.size(), width);
+    for (size_t i = 0; i < inputSequences.size(); i++)
+    {
+        if (inputSequences[i].seqId == GAP_SEQUENCE_ID)
+            continue;
+        size_t s, tBegin; tie(s, tBegin) = placement[i];
+        pMBLayout->AddSequence(inputSequences[i].seqId, s, (ptrdiff_t)tBegin, tBegin + inputSequences[i].GetNumTimeSteps());
+    }
+    // need to fill the gaps as well
+    for (size_t s = 0; s < rowAllocations.size(); s++)
+        pMBLayout->AddGap(s, (size_t)rowAllocations[s], width);
+}
+
+// wrapper class to pass MBLayout sequence vector to PackSequences()
+struct SequenceLengthVector
+{
+    typedef vector<vector<size_t>> SequenceVector;
+    typedef MBLayout::SequenceInfo SequenceInfo;
+    const SequenceVector& sequenceVector; //
+    const vector<SequenceInfo>& sequenceInfo; // original sequence info (for seqId)
+    SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : sequenceInfo(sequenceInfo), sequenceVector(sequenceVector) { }
+    size_t size() const { return sequenceInfo.size(); }
+    MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence
+    {
+        SequenceInfo seq;
+        seq.seqId = sequenceInfo[i].seqId;
+        seq.s = i;
+        seq.tBegin = 0;
+        seq.tEnd = sequenceVector[i].size();
+        return seq;
+    }
+    void operator=(const SequenceLengthVector&) = delete;
+};
+
+// TODO: Where should the MBLayout be created--in BeginForwardProp() or ForwardProp()?
+//       BeginForwardProp() should generally have no access to the actual values,
+//       while ForwardProp() might be too late. We may have to define the semantics here.
+// BUGBUG: This is the first node with value-dependent MBLayout. It resizes Value(), which we otherwise always do before.
+template <class ElemType>
+/*virtual*/ void WhereNode<ElemType>::ForwardPropNonLooping() /*override*/
+{
+    // gather all sequences
+    let& inMBLayout = Input(0)->GetMBLayout();
+    let& input = Input(0)->Value();
+    let& sequences = inMBLayout->GetAllSequences();
+    auto& indexSequences = m_indexSequenceBuffer;
+    if (indexSequences.size() < sequences.size())
+        indexSequences.resize(sequences.size());
+    for (size_t i = 0; i < sequences.size(); i++)
+    {
+        let& seq = sequences[i];
+        if (seq.seqId == GAP_SEQUENCE_ID)
+            continue;
+        auto& indexSequence = indexSequences[i];
+        indexSequence.clear();
+        for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
+            if (input(0, inMBLayout->GetColumnIndex(seq, t))) // this is the condition check that this node performs; the meat
+                indexSequence.push_back(t);
+        // Note: The above accesses m_value directly on the CPU, putting it into BOTH state, possibly for other consumers as well.
+    }
+    // create a new MBLayout
+    let& outMBLayout = GetMBLayout();
+    PackSequences(SequenceLengthVector(sequences, indexSequences), outMBLayout, /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer);
+    // copy to output
+    vector<ElemType> buf(outMBLayout->GetNumCols(), numeric_limits<ElemType>::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps
+    for (size_t i = 0; i < sequences.size(); i++)
+    {
+        let& seq = outMBLayout->GetAllSequences()[i];
+        if (seq.seqId == GAP_SEQUENCE_ID) // gaps will keep the NaN
+            continue;
+        let& indexSequence = indexSequences[i];
+        for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
+            buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
+    }
+    Value().SetValue(outMBLayout->GetNumParallelSequences(), outMBLayout->GetNumTimeSteps(), Input(0)->Value().GetDeviceId(), buf.data(), MatrixFormat::matrixFormatColMajor);
+}
+
+template <class ElemType>
+/*virtual*/ void WhereNode<ElemType>::BackpropToNonLooping(size_t /*inputIndex*/) /*override*/
+{
+    // we cannot backprop through a condition
+    // Can we?
+    return;
+}
+
+template <class ElemType>
+/*virtual*/ void WhereNode<ElemType>::Validate(bool isFinalValidationPass) /*override*/
+{
+    ComputationNodeBase::Validate(isFinalValidationPass);
+    // we generate our own MBLayout
+    if (isFinalValidationPass && !Input(0)->HasMBLayout())
+        InvalidArgument("%ls %ls operation can only operate on minibatch data (which have a layout).", NodeName().c_str(), OperationName().c_str());
+    if (!m_pMBLayout)
+        m_pMBLayout = make_shared<MBLayout>(); // this generates a new layout
+    // we map scalars to scalars
+    if (isFinalValidationPass && Input(0)->GetSampleLayout().GetNumElements() != 1)
+        InvalidArgument("%ls %ls operation can only operate on scalar input.", NodeName().c_str(), OperationName().c_str());
+    SetDims(TensorShape(1), true);
+}
+
+template class WhereNode<float>;
+template class WhereNode<double>;
+
+}}}
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index f5f60309776c..273d8260ea9f 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -561,20 +561,8 @@ class RowRepeatNode : public ComputationNode<ElemType>, public NumInputs<1>
         Input(0)->GradientFor(fr).AddToRowRepeatValuesOf(GradientFor(fr), m_numRepeat);
     }
 
-    virtual bool OutputUsedInComputingInputNodesGradients() const override
-    {
-        // The RowRepeatNode does not require its output value for computing
-        // the gradients of its input nodes
-        return false;
-    }
-
-    virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override
-    {
-        // The RowRepeatNode does not require any of it's input's values for computing
-        // the gradients of its input nodes
-        UNREFERENCED_PARAMETER(childIndex);
-        return false;
-    }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
 
 private:
     size_t m_numRepeat;
@@ -584,19 +572,49 @@ template class RowRepeatNode<float>;
 template class RowRepeatNode<double>;
 
 // -----------------------------------------------------------------------
-// DiagonalNode -- extract diagonal elements of a square matrix into a row vector
+// WhereNode -- extract indices of non-0 values in a sequence
+// As this implies a runtime-value dependent reduction in dimension, it can
+// only be applied to time sequences, and not other tensor dimensions.
+// The result will have a different MBLayout reflecting the shortened result sequences.
 // -----------------------------------------------------------------------
 
 template <class ElemType>
-class DiagonalNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<1>
+class WhereNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<1>
 {
-    typedef ComputationNodeNonLooping<ElemType> Base;
-    UsingComputationNodeMembersBoilerplate;
-    static const std::wstring TypeName()
+    typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
+    static const std::wstring TypeName() { return L"Where"; }
+
+public:
+    DeclareConstructorFromConfigWithNumInputs(WhereNode);
+    WhereNode(DEVICEID_TYPE deviceId, const wstring& name) :
+        Base(deviceId, name)
     {
-        return L"Diagonal";
+        m_learningRateMultiplier = 0.0f; // we cannot backprop; this will disable it
+        // TODO: This ^^ is a bit of a hack. Do we need a better mechanism for nodes to tell that they cannot backprop? We will have more of those.
+        // This might even not work, need to track down how this is inferred/propagated upwards. It is really only for LearnableParameters.
} + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; + virtual void Validate(bool isFinalValidationPass) override; + +private: + // buffers for creating the result sequences (kept as object state to avoid memory allocations) + std::vector> m_indexSequenceBuffer; // [sequenceIndex][t] for creating the result sequences + std::vector m_rowAllocationsBuffer; // [row] for determining new MBLayout packing + std::vector> m_placementBuffer; // [sequenceIndex] assigned location for a sequence +}; + +// ----------------------------------------------------------------------- +// DiagonalNode -- extract diagonal elements of a square matrix into a row vector +// ----------------------------------------------------------------------- + +template +class DiagonalNode : public ComputationNodeNonLooping, public NumInputs<1> +{ + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"Diagonal"; } + public: DeclareConstructorFromConfigWithNumInputs(DiagonalNode); DiagonalNode(DEVICEID_TYPE deviceId, const wstring& name) @@ -642,7 +660,7 @@ class DiagonalNode : public ComputationNodeNonLooping, public NumInput m_pMBLayout = nullptr; if (isFinalValidationPass && Input(0)->HasMBLayout()) - InvalidArgument("%ls %ls operation cannot operate on minibatch data (which have a layout)", NodeName().c_str(), OperationName().c_str()); + InvalidArgument("%ls %ls operation cannot operate on minibatch data (which have a layout).", NodeName().c_str(), OperationName().c_str()); size_t dim = Input(0)->GetAsMatrixNumCols(); if (isFinalValidationPass && dim != Input(0)->GetAsMatrixNumRows()) diff --git a/Source/ComputationNetworkLib/SpecialPurposeNodes.h b/Source/ComputationNetworkLib/SpecialPurposeNodes.h index 438cb48fe790..20f3847ceaea 100644 --- a/Source/ComputationNetworkLib/SpecialPurposeNodes.h +++ b/Source/ComputationNetworkLib/SpecialPurposeNodes.h @@ -106,20 +106,8 @@ class GMMLogLikelihoodNode : public ComputationNode, public NumInputs< } } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The GMMLogLikelihoodNode does not require its output value for computing - // the gradients of its input nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The GMMLogLikelihoodNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } void BackpropToUnnormedPrior(Matrix& unnormedPriorGradientValues, const Matrix& gradientValues, const Matrix& prior, const Matrix& posterior, Matrix& temp) diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h index 8ffeb4e5197a..1450f1787dae 100644 --- a/Source/ComputationNetworkLib/TrainingNodes.h +++ b/Source/ComputationNetworkLib/TrainingNodes.h @@ -1454,20 +1454,8 @@ class DropoutNode : public ComputationNode, public NumInputs<1> sliceInput0Grad += sliceOutputGrad; } - virtual bool OutputUsedInComputingInputNodesGradients() const override - { - // The DropoutNode does not require its output value for computing - // the gradients of its input 
nodes - return false; - } - - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override - { - // The DropoutNode does not require any of it's input's values for computing - // the gradients of its input nodes - UNREFERENCED_PARAMETER(childIndex); - return false; - } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void UpdateFunctionMBSize() override { diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 4e2211c57f3e..e7b20d426e4d 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -852,16 +852,10 @@ void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, E { Resize(numRows, numCols); - if (IsEmpty()) + if (!IsEmpty()) { - InvalidArgument("NumRows or NumCols is 0. Nothing to copy"); - } - else - { - if (!(matrixFlags & matrixFormatRowMajor)) // compatible to internal structure - { + if (!(matrixFlags & matrixFormatRowMajor)) // compatible with internal structure memcpy(m_pArray, pArray, GetNumElements() * sizeof(ElemType)); - } else // need to transpose { auto& us = *this; @@ -900,9 +894,6 @@ void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, E template void CPUMatrix::SetDiagonalValue(const ElemType v) { - if (IsEmpty()) - LogicError("SetDiagonalValue: Matrix is empty."); - if (GetNumRows() != GetNumCols()) LogicError("SetDiagonalValue: NumRows and NumCols do not agree."); diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index aabf295f95e5..a900d0042dbb 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -124,27 +124,12 @@ class MATH_API Matrix : public MatrixBase void ShallowCopyFrom(const Matrix& other); public: - MatrixType GetMatrixType() const - { - return m_matrixType; - } - MatrixFormat GetFormat() const - { - return m_baseMatrix->GetFormat(); - } - bool OwnBuffer() const - { - return m_baseMatrix->OwnBuffer(); - } + MatrixType GetMatrixType() const { return m_matrixType; } + MatrixFormat GetFormat() const { return m_baseMatrix->GetFormat(); } + bool OwnBuffer() const { return m_baseMatrix->OwnBuffer(); } int GetDeviceId() const; // -1 if CPU, otherwise GPU CUDA device id - DEVICEID_TYPE GetPreferredDeviceId() const - { - return m_preferredDeviceId; - }; // -1 if CPU, otherwise GPU CUDA device id - void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId) - { - m_preferredDeviceId = preferredDeviceId; - } + DEVICEID_TYPE GetPreferredDeviceId() const { return m_preferredDeviceId; }; // -1 if CPU, otherwise GPU CUDA device id + void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId) { m_preferredDeviceId = preferredDeviceId; } // Moves matrix from device id_from to device with id_to. 
// If emptyTransfer=true, then no data is ever moved, just corresponding GPU/CPU matrices are deleted and then created using empty constructor void TransferFromDeviceToDevice(int id_from, int id_to, bool ismoved = false, /*if false then keep source and set location to BOTH*/ bool emptyTransfer = false, bool updatePreferredDevice = true) const; @@ -235,12 +220,12 @@ class MATH_API Matrix : public MatrixBase void SetValue(const Matrix& deepCopyFrom, const MatrixFormat format = matrixFormatSparseCSR); // BUGBUG: default for 'format' is unexpected void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal); void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely - void SetValue(const size_t numRows, const size_t numCols, std::initializer_list l) + void SetValue(const size_t numRows, const size_t numCols, std::initializer_list l) // SetValue(2,3, {1,2,3, 4,5,6}); { std::vector vals(l); assert(vals.size() == numRows * numCols); SetValue(numRows, numCols, GetDeviceId(), vals.data(), matrixFormatRowMajor); - } // SetValue(2,3, {1,2,3, 4,5,6}); + } static ElemType MakeNan(size_t payload); void Invalidate() { @@ -271,35 +256,35 @@ class MATH_API Matrix : public MatrixBase Matrix& AssignTransposeOf(const Matrix& a); Matrix& operator+=(const ElemType alpha); - Matrix operator+(const ElemType alpha) const; + Matrix operator+(const ElemType alpha) const; Matrix& AssignSumOf(const ElemType alpha, const Matrix& a); Matrix& operator+=(const Matrix& a); - Matrix operator+(const Matrix& a) const; + Matrix operator+(const Matrix& a) const; Matrix& AssignSumOf(const Matrix& a, const Matrix& b); Matrix& operator-=(const ElemType alpha); - Matrix operator-(const ElemType alpha) const; + Matrix operator-(const ElemType alpha) const; Matrix& AssignDifferenceOf(const ElemType alpha, const Matrix& a); Matrix& AssignDifferenceOf(const Matrix& a, const ElemType alpha); Matrix& operator-=(const Matrix& a); - Matrix operator-(const Matrix& a) const; + Matrix operator-(const Matrix& a) const; Matrix& AssignDifferenceOf(const Matrix& a, const Matrix& b); Matrix& operator*=(const ElemType alpha); - Matrix operator*(const ElemType alpha) const; + Matrix operator*(const ElemType alpha) const; Matrix& AssignProductOf(const ElemType alpha, const Matrix& a); - Matrix operator*(const Matrix& a) const; + Matrix operator*(const Matrix& a) const; Matrix& AssignProductOf(const Matrix& a, const bool transposeA, const Matrix& b, const bool transposeB); // this = a * b Matrix& Assign1x1ProductOf(const Matrix& a1x1, const Matrix& b); // this = a * b, where a is 1x1 Matrix& operator/=(ElemType alpha); - Matrix operator/(ElemType alpha) const; + Matrix operator/(ElemType alpha) const; Matrix& operator^=(ElemType alpha); // element-wise power - Matrix operator^(ElemType alpha) const; // element-wise power + Matrix operator^(ElemType alpha) const; // element-wise power Matrix& AssignElementPowerOf(const Matrix& a, const ElemType power); // TODO: There are several functions below that perform an in-place operation From d9fecc5fbc779ce7f7a677267d6059e5da1af0d9 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 11 Mar 2016 17:00:13 -0800 Subject: [PATCH 09/26] WhereNode should keep its result on the CPU --- Source/ComputationNetworkLib/ReshapingNodes.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index 
dc1734a44987..36e93d943f2b 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -143,7 +143,8 @@ template for (size_t t = 0; t < seq.GetNumTimeSteps(); t++) buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t]; } - Value().SetValue(outMBLayout->GetNumParallelSequences(), outMBLayout->GetNumTimeSteps(), Input(0)->Value().GetDeviceId(), buf.data(), MatrixFormat::matrixFormatColMajor); + // the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode + Value().SetValue(outMBLayout->GetNumParallelSequences(), outMBLayout->GetNumTimeSteps(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor); } template From a15d8d86a2911155934215e8a19432a247f2b047 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Fri, 11 Mar 2016 18:23:09 -0800 Subject: [PATCH 10/26] added new node PackedIndex(); bug fix in stock output writer, must skip gap sequences --- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 4 ++ Source/Common/Include/Sequences.h | 9 ++- .../ComputationNetworkBuilder.cpp | 4 +- .../ComputationNetworkLib/ComputationNode.cpp | 6 +- .../ComputationNetworkLib/ReshapingNodes.cpp | 57 ++++++++++++++++++- Source/ComputationNetworkLib/ReshapingNodes.h | 37 +++++++++++- 6 files changed, 109 insertions(+), 8 deletions(-) diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index 97498816251d..9ae9ef974646 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -64,12 +64,14 @@ Delay = PastValue BatchNormalization(input, scale, bias, runMean, runInvStdDev, eval, spatial, normalizationTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ] ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ] ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] +// TODO: ColumnElementTimes = ElementTimes CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'CosDistance' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts, numNegSamples, tag='') = new ComputationNode [ operation = 'CosDistanceWithNegativeSamples' ; inputs = (aVectorSequence : anotherVectorSequence : numShifts : numNegSamples) /*plus the function args*/ ] Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = x /*plus the function args*/ ] CrossEntropy(refProbVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ] CrossEntropyWithSoftmax(labelVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (labelVectorSequence : outProbVectorSequence) /*plus the 
function args*/ ] DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ] +// TODO: DiagTimes = ElementTimes Dropout(activationVectorSequence, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = activationVectorSequence /*plus the function args*/ ] ElementTimes(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'ElementTimes' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ] ErrorPrediction(labelVectorSequence, outVectorSequence, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = (labelVectorSequence : outVectorSequence) /*plus the function args*/ ] @@ -84,11 +86,13 @@ MatrixL2Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL2Reg' ; Mean(dataVectorSequence, tag='') = new ComputationNode [ operation = 'Mean' ; inputs = dataVectorSequence /*plus the function args*/ ] Minus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Minus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ] Negate(input, tag='') = new ComputationNode [ operation = 'Negate' ; inputs = input /*plus the function args*/ ] +PackedIndex(targetObject, indexSequence, tag='') = new ComputationNode [ operation = 'PackedIndex' ; inputs = (targetObject : indexSequence) /*plus the function args*/ ] PerDimMeanVarDeNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarDeNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ] PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ] Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ] RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ] Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ] +// TODO: Scale = ElementTimes Sigmoid(z, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = z /*plus the function args*/ ] Softmax(z, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ] Hardmax(z, tag='') = new ComputationNode [ operation = 'Hardmax' ; inputs = z /*plus the function args*/ ] diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 98f98aa51866..72b2c02ffdd7 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -328,7 +328,7 @@ struct MBLayout } // find a sequence by its id - const SequenceInfo &FindSequence(UniqueSequenceId seqId) const + const SequenceInfo& FindSequence(UniqueSequenceId seqId) const { for (const auto &seqInfo : m_sequences) if (seqInfo.seqId == seqId) @@ -373,11 +373,14 @@ struct MBLayout { if (t > seq.GetNumTimeSteps()) LogicError("GetColumnIndex: t out of sequence bounds."); - ptrdiff_t tIn = (ptrdiff_t)t + seq.tBegin; + if (seq.s > GetNumParallelSequences()) + LogicError("GetColumnIndex: seq.s out of sequence bounds."); // can only happen if 'seq' does not come out of our own m_sequences array, which is verboten + ptrdiff_t tIn = 
(ptrdiff_t)t + seq.tBegin; // shifted time index if (tIn < 0 || (size_t)tIn >= GetNumTimeSteps()) LogicError("GetColumnIndex: Attempted to access a time step that is accessing a portion of a sequence that is not included in current minibatch."); // we may encounter this for truncated BPTT size_t col = (size_t)tIn * GetNumParallelSequences() + seq.s; - return (size_t)col; + assert(col < GetNumCols()); + return col; } private: diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index a623d8da72bd..3a99c5f7fce5 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -67,10 +67,11 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(MinusNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(NegateNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(NoiseContrastiveEstimationNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(PackedIndexNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(PastValueNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(PerDimMeanVarNormalizationNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(PerDimMeanVarDeNormalizationNode)) return New>(forward<_Types>(_Args)...); - else if (nodeType == OperationNameOf(TransposeDimensionsNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(TransposeDimensionsNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(PlusNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ReconcileMBLayoutNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New>(forward<_Types>(_Args)...); @@ -99,6 +100,7 @@ static shared_ptr> CreateStandardNode(const std::wstri // legacy names we also support for back compat of model-files else if (nodeType == L"ColumnElementTimes") return New>(forward<_Types>(_Args)...); else if (nodeType == L"Delay") return New>(forward<_Types>(_Args)...); + // TODO: DiagTimes is also an alias of ElementTimes; current separate implementation is unnecessary. else if (nodeType == L"PerDimMeanVarNormalizationNode") return New>(forward<_Types>(_Args)...); else if (nodeType == L"PerDimMeanVarDeNormalizationNode") return New>(forward<_Types>(_Args)...); else if (nodeType == L"RowElementTimes") return New>(forward<_Types>(_Args)...); diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index 387ee95969fa..01d398e762b8 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -318,6 +318,8 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl for (size_t s = 0; s < sequences.size(); s++) { const auto& seqInfo = sequences[s]; + if (seqInfo.seqId == GAP_SEQUENCE_ID) // nothing in gaps to print + continue; size_t tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0; size_t tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width; @@ -370,7 +372,7 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl fprintfOrDie(f, "%s", sampleSeparator.c_str()); if (j == jstop) { - fprintf(f, "... 
(%d more)", (int)(jend - jstop)); // 'nuff said
+                fprintf(f, "...+%d", (int)(jend - jstop)); // 'nuff said
                 break;
             }
             for (size_t i = 0; i < iend; i++)
             {
                 if (i > 0)
                     fprintfOrDie(f, "%s", elementSeparator.c_str());
                 if (i == istop)
                 {
-                    fprintf(f, "...");
+                    fprintf(f, "...+%d", (int)(iend - istop));
                     break;
                 }
                 else if (formatChar == 'f') // print as real number
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp
index 36e93d943f2b..3bef16b20063 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.cpp
+++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp
@@ -144,7 +144,7 @@ template <class ElemType>
             buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
     }
     // the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode
-    Value().SetValue(outMBLayout->GetNumParallelSequences(), outMBLayout->GetNumTimeSteps(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
+    Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
 }
 
 template <class ElemType>
@@ -173,4 +173,59 @@ template <class ElemType>
 template class WhereNode<float>;
 template class WhereNode<double>;
 
+template <class ElemType>
+/*virtual*/ void PackedIndexNode<ElemType>::ForwardPropNonLooping() /*override*/
+{
+    let& targetMBLayout = Input(TARGETDATA)->GetMBLayout(); // only used for index conversion
+    let& indexMBLayout = Input(INDEXDATA)->GetMBLayout();
+    let& index = Input(INDEXDATA)->Value(); // per-seq index values that are to be mapped
+    auto& result = Value(); // packed index values as mapped to targetData's layout
+    // loop over targetSequences
+    // Input matrix contains time indices for each sequence that refer to frames inside that sequence.
+    // We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout.
+    let& targetSequences = targetMBLayout->GetAllSequences();
+    for (size_t i = 0; i < targetSequences.size(); i++)
+    {
+        let& targetSeq = targetSequences[i];
+        if (targetSeq.seqId == GAP_SEQUENCE_ID)
+            continue;
+        let& indexSeq = indexMBLayout->FindSequence(targetSeq.seqId); // find corresponding entry in indexMBLayout
+        for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence
+        {
+            let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object
+            let tTarget = (size_t)index(0, jIndex); // the new time location (relative to target sequence)
+            let jTarget = targetMBLayout->GetColumnIndex(targetSeq, tTarget); // map new time index as well. This performs a range check.
+            result(0, jIndex) = (ElemType)jTarget;
+        }
+    }
+}
+
+template <class ElemType>
+/*virtual*/ void PackedIndexNode<ElemType>::BackpropToNonLooping(size_t /*inputIndex*/) /*override*/
+{
+    // we cannot backprop through a condition
+    // Can we?
+    return;
+}
+
+template <class ElemType>
+/*virtual*/ void PackedIndexNode<ElemType>::Validate(bool isFinalValidationPass) /*override*/
+{
+    ComputationNodeBase::Validate(isFinalValidationPass);
+
+    // inherit both MBLayout and sample dimension (scalar) from indexData
+    // Because we map (per-seq) index sequence to (packed) index sequence. Target is only for index calculation.
+    m_pMBLayout = Input(INDEXDATA)->GetMBLayout();
+    if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout() || !Input(TARGETDATA)->HasMBLayout()))
+        LogicError("%ls %ls operation requires both inputs to be minibatch data (must have MBLayouts).", NodeName().c_str(), OperationName().c_str());
+
+    if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1)
+        InvalidArgument("%ls %ls operation requires the second argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str());
+
+    SetDims(Input(INDEXDATA));
+}
+
+template class PackedIndexNode<float>;
+template class PackedIndexNode<double>;
+
 }}}
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index 273d8260ea9f..89faa76e01b8 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -572,7 +572,7 @@ template class RowRepeatNode<float>;
 template class RowRepeatNode<double>;
 
 // -----------------------------------------------------------------------
-// WhereNode -- extract indices of non-0 values in a sequence
+// WhereNode(cond) -- extract indices of non-0 values in a sequence
 // As this implies a runtime-value dependent reduction in dimension, it can
 // only be applied to time sequences, and not other tensor dimensions.
 // The result will have a different MBLayout reflecting the shortened result sequences.
@@ -605,6 +605,41 @@ class WhereNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<1>
     std::vector<std::pair<size_t, size_t>> m_placementBuffer; // [sequenceIndex] assigned location for a sequence
 };
 
+// -----------------------------------------------------------------------
+// PackedIndexNode(targetObject, indexSequence) -- convert sequence indices
+// to internal packed column indices w.r.t. targetObject.
+// Intended use is
+//  - Gather  (cond, x) = GatherPacked  (PackedIndex (x, Where (xCond)), x)
+//  - Scatter (cond, y) = ScatterPacked (yCond, PackedIndex (y, Where (yCond)), y)
+// This maps sequence-specific time indices t to GetColumnIndex(seq,t),
+// as input for subsequent GatherPacked() or ScatterPacked() operations.
+// -----------------------------------------------------------------------
+
+template <class ElemType>
+class PackedIndexNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<2>
+{
+    typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
+    static const std::wstring TypeName() { return L"PackedIndex"; }
+
+    // our inputs
+    static const size_t TARGETDATA = 0;
+    static const size_t INDEXDATA = 1;
+
+public:
+    DeclareConstructorFromConfigWithNumInputs(PackedIndexNode);
+    PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) :
+        Base(deviceId, name)
+    {
+        m_learningRateMultiplier = 0.0f; // we cannot backprop; this will disable it
+        // TODO: This ^^ is a bit of a hack. Do we need a better mechanism for nodes to tell that they cannot backprop? We will have more of those.
+        // This might even not work, need to track down how this is inferred/propagated upwards. It is really only for LearnableParameters.
+ } + + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; + virtual void Validate(bool isFinalValidationPass) override; +}; + // ----------------------------------------------------------------------- // DiagonalNode -- extract diagonal elements of a square matrix into a row vector // ----------------------------------------------------------------------- From 2575da305ad1bd297fdaca02b08fe1d3de849913 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sat, 12 Mar 2016 16:42:32 -0800 Subject: [PATCH 11/26] implemented GatherPackedNode and ScatterPackedNode, so far CPU only --- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 25 ++- Source/Common/Include/Sequences.h | 61 +++++- .../ComputationNetworkBuilder.cpp | 2 + .../ComputationNetworkLib/ComputationNode.h | 6 + .../ComputationNetworkLib/ReshapingNodes.cpp | 181 +++++++++++------- Source/ComputationNetworkLib/ReshapingNodes.h | 111 ++++++++--- Source/Math/CPUMatrix.cpp | 101 +++++++++- Source/Math/CPUMatrix.h | 20 +- Source/Math/Matrix.cpp | 78 ++++++-- Source/Math/Matrix.h | 5 +- 10 files changed, 452 insertions(+), 138 deletions(-) diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index 9ae9ef974646..ece5ffd5412f 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -61,6 +61,7 @@ AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSu ColumnwiseCrossProduct = KhatriRaoProduct // deprecated ClassificationError = ErrorPrediction Delay = PastValue + BatchNormalization(input, scale, bias, runMean, runInvStdDev, eval, spatial, normalizationTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ] ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ] ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ] @@ -76,6 +77,7 @@ Dropout(activationVectorSequence, tag='') = new ComputationNode [ operation = 'D ElementTimes(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'ElementTimes' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ] ErrorPrediction(labelVectorSequence, outVectorSequence, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = (labelVectorSequence : outVectorSequence) /*plus the function args*/ ] Exp(x, tag='') = new ComputationNode [ operation = 'Exp' ; inputs = x /*plus the function args*/ ] +GatherPacked(indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'GatherPacked' ; inputs = (indexSequence : sourceData) /*plus the function args*/ ] GMMLogLikelihood(unnormalizedPriorVector, meansAsRows, logStdDevAsRows, dataVectorSequence, tag='') = new ComputationNode [ operation = 'GMMLogLikelihood' ; inputs = (unnormalizedPriorVector : meansAsRows : logStdDevAsRows : dataVectorSequence) /*plus 
the function args*/ ] InvStdDev(dataVectorSequence, tag='') = new ComputationNode [ operation = 'InvStdDev' ; inputs = dataVectorSequence /*plus the function args*/ ] KhatriRaoProduct(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'KhatriRaoProduct' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ] @@ -93,6 +95,7 @@ Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ] Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ] // TODO: Scale = ElementTimes +ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ] Sigmoid(z, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = z /*plus the function args*/ ] Softmax(z, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ] Hardmax(z, tag='') = new ComputationNode [ operation = 'Hardmax' ; inputs = z /*plus the function args*/ ] @@ -185,19 +188,21 @@ Sequences = [ Map (lambda, x) = lambda (x) // that one's easy # Reverse (x) is a C++ node currently called TimeReverse - Filter (pred, x) = x // TODO: Implement this as a C++ node. + # Gather and Scatter + # We go through 3 nodes each to take advantage of x + Gather (cond, x) = GatherPacked ( PackedIndex (x, Where (cond)), x) + Scatter (cond, y) = ScatterPacked (cond, PackedIndex (y, Where (cond)), y) # sequence-altering LINQ-like operators # These generate new data packing (MBLayouts) # TakeWhile and DropWhile TakeWhile (predicate, x) = Filter ( _WhilePredicate (PastValue, predicate), x) - DropWhile (predicate, x) = Filter (!_WhilePredicate (PastValue, predicate), x) - # Skip, SkipWhile--same? + SkipWhile (predicate, x) = Filter (!_WhilePredicate (PastValue, predicate), x) _WhilePredicate (DelayFn, predicate, input) = [ - whilePredicate = Boolean.And (DelayFn (whilePredicate, defaultHiddenActivation=Boolean.True), predicate) - ].whilePredicate + whilePredicateRec = Boolean.And (DelayFn (whilePredicateRec, defaultHiddenActivation=Boolean.True), predicate) + ].whilePredicateRec # TODO: do we need operations from the back? # First and Take @@ -206,14 +211,18 @@ Sequences = [ Take (N, x) = _Take (PastValue, N, x) _Take (DelayFn, N, x) = [ selected = Loop._IsWithin (DelayFn, N, x) - out = Filter (selected, x) + out = Gather (selected, x) ].out Skip (N, x) = _Skip (PastValue, N, x) _Skip (DelayFn, N, x) = [ // TODO: merge with _Take selected = Loop._IsWithin (DelayFn, N, x) - out = Filter (!selected, x) + out = Gather (!selected, x) ].out - ElementAt (n, x) = First (Skip (n, x)) // not efficient, as it filters twice. Better AND the predicates. TODO: what if n is out of range? ElementAtOrDefault + ElementAt (n, x) = [ // not efficient, as it filters twice. Better AND the predicates. TODO: what if n is out of range? ElementAtOrDefault + startMask = Skip (n, x) // ...000111... + mask = startMask - PastValue (0, startMask) // ...000100... + out = Gather (mask, x) + ] Single (predicate, x) = x #FirstOrDefault (x) = ? // can empty sequences exist or even be represented by CNTK? 
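The Gather/Scatter rewrite above composes three nodes: Where() yields the per-sequence time indices of the nonzero condition values, PackedIndex() resolves those to flat column indices of the packed minibatch matrix, and GatherPacked()/ScatterPacked() copy columns accordingly. The following is a minimal single-sequence C++ sketch of that pipeline (illustrative only, not CNTK code; with a single sequence, the MBLayout column mapping degenerates to the identity):

#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> x    = { 10, 20, 30, 40, 50 };
    std::vector<bool>  cond = { true, false, true, false, true };

    // Where (cond): time indices of the nonzero condition values
    std::vector<size_t> where;
    for (size_t t = 0; t < cond.size(); t++)
        if (cond[t])
            where.push_back(t);

    // PackedIndex (x, where): per-sequence time index -> packed column index;
    // with a single sequence this is the identity mapping
    std::vector<size_t> packedIndex(where);

    // GatherPacked (packedIndex, x): copy out the selected columns
    std::vector<float> gathered;
    for (size_t j : packedIndex)
        gathered.push_back(x[j]);

    for (float v : gathered)
        printf("%g ", v); // prints: 10 30 50
    printf("\n");
    return 0;
}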
diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 72b2c02ffdd7..829c54c173ff 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -175,6 +175,60 @@ struct MBLayout m_writable = true; } + // packing algorithm + // - width: maximum width of structure; set to maximum over sequence lengths + // - inputSequences: vector of input SequenceInfo records (only seqId and GetNumTimeSteps() are used) + // - [out] *this: MBLayout that describes the created packed sequence set + // - placement, rowAllocations: temp buffers (passed in to be able to optimize memory allocations) + template + void InitAsPackedSequences(const SequenceInfoVector& inputSequences, + /*temp buffer*/std::vector>& placement, + /*temp buffer*/std::vector rowAllocations) + { + placement.resize(inputSequences.size()); // [sequence index] result goes here (entries are invalid for gaps) + // determine width of MBLayout + size_t width = 0; + for (size_t i = 0; i < inputSequences.size(); i++) + if (inputSequences[i].seqId == GAP_SEQUENCE_ID) + continue; + else if (width < inputSequences[i].GetNumTimeSteps()) + width = inputSequences[i].GetNumTimeSteps(); + // allocate + rowAllocations.clear(); // [row] we build rows one by one + for (size_t i = 0; i < inputSequences.size(); i++) + { + if (inputSequences[i].seqId == GAP_SEQUENCE_ID) + continue; + let len = inputSequences[i].GetNumTimeSteps(); + // first see if we find a row that has enough space + // TODO: Should we use a proper priority_queue? + size_t s; + for (s = 0; s < rowAllocations.size(); s++) + if (rowAllocations[s] + len <= width) + break; // yep, it fits + // if we did not find a row that fits, create a new one + if (s == rowAllocations.size()) + rowAllocations.push_back(0); + // sequence goes to (s, rowAllocations[s]) + placement[i] = make_pair(s, rowAllocations[s]); + // and allocate it + rowAllocations[s] += len; + } + // create MBLayout + Init(rowAllocations.size(), width); + for (size_t i = 0; i < inputSequences.size(); i++) + { + if (inputSequences[i].seqId == GAP_SEQUENCE_ID) + continue; + size_t s, tBegin; tie + (s, tBegin) = placement[i]; + AddSequence(inputSequences[i].seqId, s, (ptrdiff_t)tBegin, tBegin + inputSequences[i].GetNumTimeSteps()); + } + // need to fill the gaps as well + for (size_t s = 0; s < rowAllocations.size(); s++) + AddGap(s, (size_t)rowAllocations[s], width); + } // ------------------------------------------------------------------- // accessors // ------------------------------------------------------------------- @@ -1003,7 +1057,7 @@ static inline std::pair TensorSliceWithMBLayou // 'Reduce' style operations--the criterion nodes and gradient computation--call this. // Warning: The layout used here must match the matrix. E.g. don't pass a child's matrix from a criterion node (use Input(x)->MaskMissing{Values,Gradient}ColumnsToZero() instead).
template -static inline void MaskMissingColumnsTo(Matrix &matrixToMask, const MBLayoutPtr &pMBLayout, const FrameRange &fr, ElemType val) +static inline void MaskMissingColumnsTo(Matrix& matrixToMask, const MBLayoutPtr& pMBLayout, const FrameRange& fr, ElemType val) { if (pMBLayout && pMBLayout->HasGaps(fr)) { @@ -1013,11 +1067,12 @@ static inline void MaskMissingColumnsTo(Matrix &matrixToMask, const MB auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); TensorView(matrixSliceToMask).DoMaskNegativeOf(0, TensorView(matrixSliceToMask), TensorView(maskSlice), 1); val; #else - const auto &maskMatrix = pMBLayout->GetColumnsValidityMask(matrixToMask.GetDeviceId()); + const auto& maskMatrix = pMBLayout->GetColumnsValidityMask(matrixToMask.GetDeviceId()); auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout); auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); matrixSliceToMask.MaskColumnsValue(maskSlice, val); #endif } } -} } } + +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index 3a99c5f7fce5..2fed249ca72a 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -52,6 +52,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(ErrorPredictionNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ExpNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(FutureValueNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(GatherPackedNode)) return New>(forward<_Types>(_Args)...); #ifdef COMING_SOON else if (nodeType == OperationNameOf(GMMLogLikelihoodNode)) return New>(forward<_Types>(_Args)...); #endif @@ -79,6 +80,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(RowRepeatNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(RowSliceNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(RowStackNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(ScatterPackedNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(SequenceWithSoftmaxNode)) return New>(forward<_Types>(_Args)...); #ifdef COMING_SOON else if (nodeType == OperationNameOf(SequenceDecoderNode)) return New>(forward<_Types>(_Args)...); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index b42e1098ca17..a7a9c521db22 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -1057,6 +1057,12 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot MaskMissingColumnsToZero(*m_gradient, m_pMBLayout, fr); } + // for index vectors: Invalid entries must be set to -1. 
+ void MaskMissingValueColumnsTo(const FrameRange& fr, ElemType val) + { + MaskMissingColumnsTo(*m_value, m_pMBLayout, fr, val); + } + // for debugging, set the gaps to NaN instead (to track whether it bubbles up somewhere) void InvalidateMissingValueColumns(const FrameRange& fr) override final { diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index 3bef16b20063..eaf2217057a0 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -27,61 +27,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Where(bitVector) -- extract indices of non-0 values in a sequence // ----------------------------------------------------------------------- -// TODO: move to MBLayout as a static method -// packing algorithm -// - width: maximum width of structure; set to maximum over sequence lengths -// - inputSequences: vector of input SequenceInfo records (only seqId and GetNumTimeSteps() are used) -// - [out] *pMBLayout: MBLayout that describes the created packed sequence set -// - placement, rowAllocations: temp buffers (passed in to be able to optimize memory allocations) -template -static void PackSequences(const SequenceInfoVector& inputSequences, - /*ref->out*/MBLayoutPtr pMBLayout, - /*temp buffer*/std::vector>& placement, - /*temp buffer*/std::vector rowAllocations) -{ - placement.resize(inputSequences.size()); // [sequence index] result goes here (entries are invalid for gaps) - // determine width of MBLayout - size_t width = 0; - for (size_t i = 0; i < inputSequences.size(); i++) - if (inputSequences[i].seqId == GAP_SEQUENCE_ID) - continue; - else if (width < inputSequences[i].GetNumTimeSteps()) - width = inputSequences[i].GetNumTimeSteps(); - // allocate - rowAllocations.clear(); // [row] we build rows one by one - for (size_t i = 0; i < inputSequences.size(); i++) - { - if (inputSequences[i].seqId == GAP_SEQUENCE_ID) - continue; - let len = inputSequences[i].GetNumTimeSteps(); - // first see if we find a row that has enough space - size_t s; - for (s = 0; s < rowAllocations.size(); s++) - if (rowAllocations[s] + len <= width) - break; // yep, it fits - // we did not find a s that fit then create a new one - if (s == rowAllocations.size()) - rowAllocations.push_back(0); - // sequence goes to (s, rowAllocations[s]) - placement[i] = make_pair(s, rowAllocations[s]); - // and allocate it - rowAllocations[s] += len; - } - // create MBLayout - pMBLayout->Init(rowAllocations.size(), width); - for (size_t i = 0; i < inputSequences.size(); i++) - { - if (inputSequences[i].seqId == GAP_SEQUENCE_ID) - continue; - size_t s, tBegin; tie - (s, tBegin) = placement[i]; - pMBLayout->AddSequence(inputSequences[i].seqId, s, (ptrdiff_t)tBegin, tBegin + inputSequences[i].GetNumTimeSteps()); - } - // need to fill the gaps as well - for (size_t s = 0; s < rowAllocations.size(); s++) - pMBLayout->AddGap(s, (size_t)rowAllocations[s], width); -} - // wrapper class to pass MBLayout sequence vector to PackSequences() struct SequenceLengthVector { @@ -131,7 +76,7 @@ template } // create a new MBLayout let& outMBLayout = GetMBLayout(); - PackSequences(SequenceLengthVector(sequences, indexSequences), outMBLayout, /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer); + outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer); // copy to output vector buf(outMBLayout->GetNumCols(), numeric_limits::quiet_NaN()); 
// STL cannot easily avoid initializing, so we might as well init with NaN for gaps for (size_t i = 0; i < sequences.size(); i++) @@ -173,29 +118,33 @@ template template class WhereNode; template class WhereNode; +// ----------------------------------------------------------------------- +// PackedIndexNode(targetObject, indexSequence) -- map sequence indices to packed column indices +// ----------------------------------------------------------------------- + template /*virtual*/ void PackedIndexNode::ForwardPropNonLooping() /*override*/ { - let& targetMBLayout = Input(TARGETDATA)->GetMBLayout(); // only used for index conversion + let& sourceMBLayout = Input(SOURCEDATA)->GetMBLayout(); // only used for index conversion let& indexMBLayout = Input(INDEXDATA)->GetMBLayout(); let& index = Input(INDEXDATA)->Value(); // per-seq index values that are to be mapped - auto& result = Value(); // packed index values as mapped to targetData's layout - // loop over targetSequences + auto& result = Value(); // packed index values as mapped to sourceData's layout + // loop over sourceSequences // Input matrix contains time indices for each sequence that refer to frames inside that sequence. // We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout. - let& targetSequences = targetMBLayout->GetAllSequences(); - for (size_t i = 0; i < targetSequences.size(); i++) + let& sourceSequences = sourceMBLayout->GetAllSequences(); + for (size_t i = 0; i < sourceSequences.size(); i++) { - let& targetSeq = targetSequences[i]; - if (targetSeq.seqId == GAP_SEQUENCE_ID) + let& sourceSeq = sourceSequences[i]; + if (sourceSeq.seqId == GAP_SEQUENCE_ID) continue; - let& indexSeq = indexMBLayout->FindSequence(targetSeq.seqId); // find corresponding entry in indexMBLayout + let& indexSeq = indexMBLayout->FindSequence(sourceSeq.seqId); // find corresponding entry in indexMBLayout for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence { let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object - let tTarget = (size_t)index(0, jIndex); // the new time location (relative to target sequence) - let jTarget = targetMBLayout->GetColumnIndex(targetSeq, tTarget); // map new time index as well. This performs a range check. - result(0, jIndex) = (ElemType)jTarget; + let tSource = (size_t)index(0, jIndex); // the new time location (relative to source sequence) + let jSource = sourceMBLayout->GetColumnIndex(sourceSeq, tSource); // map new time index as well. This performs a range check. + result(0, jIndex) = (ElemType)jSource; } } } @@ -216,7 +165,7 @@ template // inherit both MBLayout and sample dimension (scalar) from indexData // Because we map (per-seq) index sequence to (packed) index sequence. Target is only for index calculation.
m_pMBLayout = Input(INDEXDATA)->GetMBLayout(); - if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout() || !Input(TARGETDATA)->HasMBLayout())) + if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout() || !Input(SOURCEDATA)->HasMBLayout())) LogicError("%ls %ls operation requires both inputs to be minibatch data (must have MBLayouts).", NodeName().c_str(), OperationName().c_str()); if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1) @@ -228,4 +177,100 @@ template template class PackedIndexNode; template class PackedIndexNode; +// ----------------------------------------------------------------------- +// GatherPackedNode(packedIndex, sourceData) -- gather operation +// ----------------------------------------------------------------------- + +template +/*virtual*/ void GatherPackedNode::ForwardPropNonLooping() /*override*/ +{ + Input(INDEXDATA)->MaskMissingValueColumnsTo(FrameRange(Input(INDEXDATA)->GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter + let& index = Input(INDEXDATA)->Value(); // column indices to copy from + let& source = Input(SOURCEDATA)->Value(); // source data to copy + auto& output = Value(); // output goes here + output.DoGatherColumnsOf(/*beta=*/0, index, source, /*alpha=*/1); +} + +template +/*virtual*/ void GatherPackedNode::BackpropToNonLooping(size_t inputIndex) /*override*/ +{ + if (inputIndex == SOURCEDATA) + { + let& index = Input(INDEXDATA)->Value(); // column indices to copy from + auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input + auto& outputGradient = Gradient(); // output gradient to propagate + sourceGradient.DoScatterColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1); + } +} + +template +/*virtual*/ void GatherPackedNode::Validate(bool isFinalValidationPass) /*override*/ +{ + ComputationNodeBase::Validate(isFinalValidationPass); + + // inherit MBLayout from indexData + m_pMBLayout = Input(INDEXDATA)->GetMBLayout(); + if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout() || !Input(SOURCEDATA)->HasMBLayout())) + LogicError("%ls %ls operation requires both inputs to be minibatch data (must have MBLayouts).", NodeName().c_str(), OperationName().c_str()); + + if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1) + InvalidArgument("%ls %ls operation requires the first argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str()); + + // inherit tensor dimension from sourceData + SetDims(Input(SOURCEDATA)); +} + +template class GatherPackedNode; +template class GatherPackedNode; + +// ----------------------------------------------------------------------- +// ScatterPackedNode(layoutData, packedIndex, sourceData) -- scatter operation +// ----------------------------------------------------------------------- + +template +/*virtual*/ void ScatterPackedNode::ForwardPropNonLooping() /*override*/ +{ + if (*Input(INDEXDATA)->GetMBLayout() != *Input(SOURCEDATA)->GetMBLayout()) + InvalidArgument("%ls %ls operation requires the minibatch layout of index and source data to be the same.", NodeName().c_str(), OperationName().c_str()); + Input(INDEXDATA)->MaskMissingValueColumnsTo(FrameRange(Input(INDEXDATA)->GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter + let& index = Input(INDEXDATA)->Value(); // column indices to copy from + let& source = Input(SOURCEDATA)->Value(); // source data to copy + auto& output = Value(); // output goes here + 
output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1); +} + +template +/*virtual*/ void ScatterPackedNode::BackpropToNonLooping(size_t inputIndex) /*override*/ +{ + if (inputIndex == SOURCEDATA) + { + let& index = Input(INDEXDATA)->Value(); // column indices to copy from + auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input + auto& outputGradient = Gradient(); // output gradient to propagate + sourceGradient.DoGatherColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1); + } +} + +template +/*virtual*/ void ScatterPackedNode::Validate(bool isFinalValidationPass) /*override*/ +{ + ComputationNodeBase::Validate(isFinalValidationPass); + + // inherit MBLayout from layoutData (that's the only thing we use it for) + m_pMBLayout = Input(LAYOUTDATA)->GetMBLayout(); + if (isFinalValidationPass && (!Input(LAYOUTDATA)->HasMBLayout() || !Input(INDEXDATA)->HasMBLayout() || !Input(SOURCEDATA)->HasMBLayout())) + LogicError("%ls %ls operation requires all inputs to be minibatch data (must have MBLayouts).", NodeName().c_str(), OperationName().c_str()); + + if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1) + InvalidArgument("%ls %ls operation requires the second argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str()); + + // TODO: We also know that indexData and sourceData must have the same MBLayout. But that is checked at runtime. + + // inherit tensor dimension from sourceData + SetDims(Input(SOURCEDATA)); +} + +template class ScatterPackedNode; +template class ScatterPackedNode; + }}} diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 89faa76e01b8..dbade27997b3 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -46,12 +46,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { template class ReshapeNode : public UnaryElementWiseNode { - typedef UnaryElementWiseNode Base; - UsingUnaryElementwiseNodeBaseMembers; - static const std::wstring TypeName() - { - return L"Reshape"; - } + typedef UnaryElementWiseNode Base; UsingUnaryElementwiseNodeBaseMembers; + static const std::wstring TypeName() { return L"Reshape"; } public: ReshapeNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& replacementSampleLayout = TensorShape(), int beginDim = 1, int endDim = 0) @@ -185,12 +181,8 @@ template class ReshapeNode; template class ReconcileMBLayoutNode : public ComputationNode, public NumInputs<2> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"ReconcileMBLayout"; - } + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"ReconcileMBLayout"; } public: DeclareConstructorFromConfigWithNumInputs(ReconcileMBLayoutNode); @@ -241,13 +233,13 @@ template class ReconcileMBLayoutNode; // ----------------------------------------------------------------------- // RowSliceNode (input) // This node extracts a slice of the first tensor dimension (row). +// TODO: Extend to specifying the axis. Time slicing would have to be done in BrainScript using Gather. 
// ----------------------------------------------------------------------- template class RowSliceNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; static const std::wstring TypeName() { return L"RowSlice"; } public: @@ -351,8 +343,7 @@ template class RowSliceNode; template class RowStackNode : public ComputationNode // note: not deriving from NumInputs<> like most other nodes, because this one takes a variable number of inputs { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; static const std::wstring TypeName() { return L"RowStack"; } public: @@ -492,12 +483,8 @@ template class RowStackNode; template class RowRepeatNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"RowRepeat"; - } + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"RowRepeat"; } public: RowRepeatNode(DEVICEID_TYPE deviceId, const wstring& name, size_t numRepeats = 1) @@ -589,13 +576,12 @@ class WhereNode : public ComputationNodeNonLooping, public NumInputs<1 WhereNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { - m_learningRateMultiplier = 0.0f; // we cannot backprop; this will disable it - // TODO: This ^^ is a bit of a hack. Do we need a better mechanism for nodes to tell that they cannot backprop? We will have more of those. - // This might even not work, need to track down how this is inferred/propagated upwards. It is really only for LearnableParameters. } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void Validate(bool isFinalValidationPass) override; private: @@ -610,7 +596,7 @@ class WhereNode : public ComputationNodeNonLooping, public NumInputs<1 // to internal packed column indices w.r.t. targetObject. // Intended use is // - Gather (cond, x) = GatherPacked (PackedIndex (x, Where (xCond)), x) -// - Scatter (cond, y) = ScatterPacked (PackedIndex (y, Where (yCond)), y) +// - Scatter (cond, y) = ScatterPacked (yCond, PackedIndex (y, Where (yCond)), y) // This maps sequence-specific time indices t to GetColumnIndex(seq,t), // as input for subsequent GatherPacked() or ScatterPacked() operations. // ----------------------------------------------------------------------- @@ -622,7 +608,7 @@ class PackedIndexNode : public ComputationNodeNonLooping, public NumIn static const std::wstring TypeName() { return L"PackedIndex"; } // our inputs - static const size_t TARGETDATA = 0; + static const size_t SOURCEDATA = 0; static const size_t INDEXDATA = 1; public: @@ -630,13 +616,78 @@ class PackedIndexNode : public ComputationNodeNonLooping, public NumIn PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { - m_learningRateMultiplier = 0.0f; // we cannot backprop; this will disable it - // TODO: This ^^ is a bit of a hack. Do we need a better mechanism for nodes to tell that they cannot backprop? 
We will have more of those. - // This might even not work, need to track down how this is inferred/propagated upwards. It is really only for LearnableParameters. } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } virtual void Validate(bool isFinalValidationPass) override; }; +// ----------------------------------------------------------------------- +// GatherPackedNode(packedIndex, sourceData) -- gather operation +// Copies a subset of samples pointed to by packedIndex from sourceData. +// Sequence lengths are equal to those from packedIndex. +// PackedIndex must have been created with the PackedIndex() node, and is +// otherwise opaque to users. +// ----------------------------------------------------------------------- + +template +class GatherPackedNode : public ComputationNodeNonLooping, public NumInputs<2> +{ + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"GatherPacked"; } + + // our inputs + static const size_t INDEXDATA = 0; + static const size_t SOURCEDATA = 1; + +public: + DeclareConstructorFromConfigWithNumInputs(GatherPackedNode); + GatherPackedNode(DEVICEID_TYPE deviceId, const wstring& name) : + Base(deviceId, name) + { + } + + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override; + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == INDEXDATA; } + virtual void Validate(bool isFinalValidationPass) override; +}; + +// ----------------------------------------------------------------------- +// ScatterPackedNode(layoutData, packedIndex, sourceData) -- scatter operation +// Copies sourceData to sample positions pointed to by packedIndex. +// The first arg, 'layoutData', is used only to determine sequence lengths, +// and should be the same data that was fed to Where(). +// PackedIndex must have been created with the PackedIndex() node, and is +// otherwise opaque to users. 
+// ----------------------------------------------------------------------- + +template +class ScatterPackedNode : public ComputationNodeNonLooping, public NumInputs<3> +{ + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"ScatterPacked"; } + + // our inputs + static const size_t LAYOUTDATA = 0; + static const size_t INDEXDATA = 1; + static const size_t SOURCEDATA = 2; + +public: + DeclareConstructorFromConfigWithNumInputs(ScatterPackedNode); + ScatterPackedNode(DEVICEID_TYPE deviceId, const wstring& name) : + Base(deviceId, name) + { + } + + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override; + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == INDEXDATA; } virtual void Validate(bool isFinalValidationPass) override; }; diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index e7b20d426e4d..6b411dc4929a 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -677,6 +677,85 @@ CPUMatrix& CPUMatrix::AssignTransposeOf(const CPUMatrix +static void ScaleAndAddColumn(ElemType beta, ElemType* dst, const ElemType* src, size_t numRows, ElemType alpha) +{ + if (alpha != 1) // rare case: just do the full thing + for (size_t i = 0; i < numRows; i++) + dst[i] = beta * dst[i] + alpha * src[i]; + else if (beta == 1) // used in backprop + for (size_t i = 0; i < numRows; i++) + dst[i] += src[i]; + else if (beta == 0) // plain assignment + memcpy(dst, src, sizeof(ElemType) * numRows); + else // alpha=1, arbitrary beta: also rare case + for (size_t i = 0; i < numRows; i++) + dst[i] = beta * dst[i] + src[i]; +} + +// *this[:,j] = a[:,m[j]] * alpha + *this[:,j] * beta +template +CPUMatrix& CPUMatrix::DoGatherColumnsOf(ElemType beta, const CPUMatrix& m, const CPUMatrix& a, ElemType alpha) +{ + if (m.GetNumRows() != 1) // index is 1-dimensional only + InvalidArgument("DoGatherColumnsOf: Map must be a row vector."); + + if (beta) + VerifySize(a.GetNumRows(), m.GetNumCols()); + else + Resize(a.GetNumRows(), m.GetNumCols()); + + auto& us = *this; +//#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows. 
+ foreach_column(jOut, us) + { + auto jInF = m(0, jOut); // this is the column we need to get + if (jInF < 0) // negative index means gap + continue; + size_t jIn = (size_t)jInF; + if (jIn >= a.GetNumCols()) + InvalidArgument("DoGatherColumnsOf: Map out of bounds."); + ScaleAndAddColumn(beta, &us(0,jOut), &a(0,jIn), us.GetNumRows(), alpha); + } + + return *this; +} + +// *this[:,m[j]] = a[:,j] * alpha + *this[:,m[j]] * beta +template +CPUMatrix& CPUMatrix::DoScatterColumnsOf(ElemType beta, const CPUMatrix& m, const CPUMatrix& a, ElemType alpha) +{ + if (m.GetNumRows() != 1) // index is 1-dimensional only + InvalidArgument("DoScatterColumnsOf: Map must be a row vector."); + if (m.GetNumCols() != a.GetNumCols()) + InvalidArgument("DoScatterColumnsOf: Map must have width of input vector."); + if (a.GetNumRows() != GetNumRows()) + InvalidArgument("DoScatterColumnsOf: Output must have same height as input vector."); + + auto& us = *this; + + // pre-scale with beta upfront + // Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding. + Scale(beta, us); // if beta is 0, then this will be a memset() + +#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows. + foreach_column(jIn, a) + { + auto jOutF = m(0, jIn); // this is the column we copy/add into + if (jOutF < 0) // negative index means gap + continue; + size_t jOut = (size_t)jOutF; + if (jOut >= GetNumCols()) + InvalidArgument("DoScatterColumnsOf: Map out of bounds."); + ScaleAndAddColumn(beta, &us(0, jOut), &a(0, jIn), us.GetNumRows(), alpha); + } + + return *this; +} + template void CPUMatrix::SetValue(const ElemType v) { @@ -4629,7 +4708,7 @@ void CPUMatrix::AssignScaledDifference(const CPUMatrix& alph /// Input matrix /// Resulting matrix, user is responsible for allocating this template -void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CPUMatrix& c) +/*static*/ void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CPUMatrix& c) { if (a.IsEmpty()) LogicError("Scale: Input matrix a is empty."); @@ -4640,6 +4719,12 @@ void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CP assert(m > 0 && n > 0); // converting from size_t to int may cause overflow c.Resize(m, n); + if (alpha == 0) + { + memset(c.m_pArray, 0, sizeof(ElemType) * c.GetNumElements()); + return; + } + long size = (long) c.GetNumElements(); #pragma omp parallel for // four-way unrolling for (long i = 0; i < (size & ~3); i += 4) { c.m_pArray[i] = alpha * a.m_pArray[i]; c.m_pArray[i + 1] = alpha * a.m_pArray[i + 1]; c.m_pArray[i + 2] = alpha * a.m_pArray[i + 2]; c.m_pArray[i + 3] = alpha * a.m_pArray[i + 3]; } - // handle remaining stuffs + // remaining elements for (long i = size & ~3; i < size; i++) { c.m_pArray[i] = alpha * a.m_pArray[i]; } @@ -4661,7 +4746,7 @@ void CPUMatrix::Scale(ElemType alpha, const CPUMatrix& a, CP /// Scalar /// Input matrix template -void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) +/*static*/ void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) { if (a.IsEmpty()) LogicError("Scale: Input matrix a is empty."); @@ -4673,10 +4758,14 @@ void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) assert(m > 0 && n > 0 && len > 0); // converting from size_t to int may cause overflow - if (sizeof(ElemType) == sizeof(double)) + if (alpha == 0 && incx == 1) + { + memset(a.m_pArray, 0, sizeof(ElemType) * len); + } + else if (sizeof(ElemType) == sizeof(double)) { #ifdef USE_ACML - dscal(len, alpha, 
reinterpret_cast(a.m_pArray), incx); // TODO: Use overloads. #else cblas_dscal(len, alpha, reinterpret_cast(a.m_pArray), incx); #endif @@ -4696,7 +4785,7 @@ void CPUMatrix::Scale(ElemType alpha, CPUMatrix& a) /// 1x1 matrix /// Input matrix template -void CPUMatrix::Scale(CPUMatrix alpha, CPUMatrix& a) +/*static*/ void CPUMatrix::Scale(CPUMatrix alpha, CPUMatrix& a) { if (a.IsEmpty()) LogicError("Scale: Input matrix a is empty."); diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 2a8f7dbd3619..3de2abfb1825 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -53,6 +53,7 @@ class MATH_API CPUMatrix : public BaseMatrix using B::GetNumRows; using B::GetNumCols; using B::SetOwnBuffer; + using B::VerifySize; size_t BufferSize() const { @@ -120,35 +121,38 @@ class MATH_API CPUMatrix : public BaseMatrix CPUMatrix Transpose(); CPUMatrix& AssignTransposeOf(const CPUMatrix& a); + CPUMatrix& DoGatherColumnsOf (ElemType beta, const CPUMatrix& m, const CPUMatrix& a, ElemType alpha); + CPUMatrix& DoScatterColumnsOf(ElemType beta, const CPUMatrix& m, const CPUMatrix& a, ElemType alpha); + CPUMatrix& operator+=(const ElemType alpha); - CPUMatrix operator+(const ElemType alpha) const; + CPUMatrix operator+(const ElemType alpha) const; CPUMatrix& AssignSumOf(const ElemType alpha, const CPUMatrix& a); CPUMatrix& operator+=(const CPUMatrix& a); - CPUMatrix operator+(const CPUMatrix& a) const; + CPUMatrix operator+(const CPUMatrix& a) const; CPUMatrix& AssignSumOf(const CPUMatrix& a, const CPUMatrix& b); CPUMatrix& operator-=(const ElemType alpha); - CPUMatrix operator-(const ElemType alpha) const; + CPUMatrix operator-(const ElemType alpha) const; CPUMatrix& AssignDifferenceOf(const ElemType alpha, const CPUMatrix& a); CPUMatrix& AssignDifferenceOf(const CPUMatrix& a, const ElemType alpha); CPUMatrix& operator-=(const CPUMatrix& a); - CPUMatrix operator-(const CPUMatrix& a) const; + CPUMatrix operator-(const CPUMatrix& a) const; CPUMatrix& AssignDifferenceOf(const CPUMatrix& a, const CPUMatrix& b); CPUMatrix& operator*=(const ElemType alpha); - CPUMatrix operator*(const ElemType alpha) const; + CPUMatrix operator*(const ElemType alpha) const; CPUMatrix& AssignProductOf(const ElemType alpha, const CPUMatrix& a); - CPUMatrix operator*(const CPUMatrix& a) const; + CPUMatrix operator*(const CPUMatrix& a) const; CPUMatrix& AssignProductOf(const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB); CPUMatrix& operator/=(ElemType alpha); - CPUMatrix operator/(ElemType alpha) const; + CPUMatrix operator/(ElemType alpha) const; CPUMatrix& operator^=(ElemType alpha); // element-wise power - CPUMatrix operator^(ElemType alpha) const; // element-wise power + CPUMatrix operator^(ElemType alpha) const; // element-wise power CPUMatrix& AssignElementPowerOf(const CPUMatrix& a, const ElemType power); CPUMatrix& ElementMultiplyWith(const CPUMatrix& a); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 4c425586bc17..d884aba884b4 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -1069,6 +1069,43 @@ Matrix& Matrix::AssignTransposeOf(const Matrix& a) return *this; } +// *this[:,j] = a[:,m[j]] * alpha + *this[:,j] * beta +// m has width of 'this' and contains values w.r.t. 'a' +// Invalid entries (gap columns) are denoted by m(0,j) == -1. 
+template +Matrix& Matrix::DoGatherColumnsOf(ElemType beta, const Matrix& m, const Matrix& a, ElemType alpha) +{ + DecideAndMoveToRightDevice(*this, m, a); // TODO: only move target if beta != 0 + + DISPATCH_MATRIX_ON_FLAG(&a, + this, + m_CPUMatrix->DoGatherColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha), + NOT_IMPLEMENTED, //m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + +// *this[:,m[j]] = a[:,j] * alpha + *this[:,m[j]] * beta +// m has width of 'a' and contains values w.r.t. 'this' +// Unlike gather, for scatter, 'this' must have been sized already. +// Invalid entries (gap columns) are denoted by m(0,j) == -1. +template +Matrix& Matrix::DoScatterColumnsOf(ElemType beta, const Matrix& m, const Matrix& a, ElemType alpha) +{ + DecideAndMoveToRightDevice(*this, m, a); // TODO: only move target if beta != 0 + + DISPATCH_MATRIX_ON_FLAG(&a, + this, + m_CPUMatrix->DoScatterColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha), + NOT_IMPLEMENTED, //m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); + + return *this; +} + // set all elements of a matrix to a scalar value // For sparse matrices, the only allowed value is 0. template @@ -1318,7 +1355,7 @@ void Matrix::SetGaussianRandomValue(const ElemType mean, const ElemTyp InvalidArgument("SetUniformRandomValue: sigma must be a positive value."); if (IsEmpty()) - LogicError("SetUniformRandomValue: Matrix is empty."); + return; DISPATCH_MATRIX_ON_FLAG(this, this, @@ -3424,6 +3461,9 @@ int Matrix::GetDeviceId() const return m_GPUSparseMatrix->GetComputeDeviceId()); } +// TODO: Move the shared core functions to the front of this source file. +// BUGBUG: This performs a copy operation even for the output matrix that gets overwritten right away. +// We should (1) define which is the output and (2) whether it will be completely overwritten (so we won't actually copy it). // bring two matrices onto the same device // If different and preferred devices are the same, move to preferred device. // Otherwise GPU takes precedence over CPU, and if both are GPU move to a's device. 
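For reference, the column semantics stated in the comments above can be reduced to a self-contained sketch on one-row matrices (GatherColumns/ScatterColumns are illustrative stand-ins for DoGatherColumnsOf/DoScatterColumnsOf, not the CNTK implementation; as in the patch, the index row m uses -1 to mark gap columns, and scatter pre-scales the target by beta because several source columns may land on the same target column):

#include <cstdio>
#include <vector>

// us[j] = a[m[j]] * alpha + us[j] * beta, skipping gap entries (m[j] == -1)
static void GatherColumns(float beta, const std::vector<float>& m,
                          const std::vector<float>& a, float alpha,
                          std::vector<float>& us)
{
    if (beta == 0)
        us.assign(m.size(), 0); // beta == 0: (re-)allocate the output, like Resize() in the patch
    for (size_t j = 0; j < m.size(); j++)
        if (m[j] >= 0) // -1 denotes a gap: leave that output column untouched
            us[j] = a[(size_t)m[j]] * alpha + us[j] * beta;
}

// us[m[j]] = a[j] * alpha + us[m[j]] * beta, skipping gap entries (m[j] == -1)
static void ScatterColumns(float beta, const std::vector<float>& m,
                           const std::vector<float>& a, float alpha,
                           std::vector<float>& us)
{
    for (auto& v : us) // pre-scale with beta up front: several j may hit the same target column
        v *= beta;
    for (size_t j = 0; j < m.size(); j++)
        if (m[j] >= 0) // -1 denotes a gap: contributes nothing
            us[(size_t)m[j]] += a[j] * alpha;
}

int main()
{
    std::vector<float> a = { 10, 20, 30, 40 };
    std::vector<float> m = { 2, -1, 0 }; // gather columns 2 and 0 of 'a'; the middle entry is a gap
    std::vector<float> us;
    GatherColumns(/*beta=*/0, m, a, /*alpha=*/1, us);     // us   = { 30, 0, 10 }

    std::vector<float> back(a.size(), 0);
    ScatterColumns(/*beta=*/0, m, us, /*alpha=*/1, back); // back = { 10, 0, 30, 0 }
    printf("%g %g %g %g\n", back[0], back[1], back[2], back[3]);
    return 0;
}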
@@ -4454,13 +4494,21 @@ template void Matrix::Scale(ElemType alpha, const Matrix& a, Matrix& c) { DecideAndMoveToRightDevice(c, a); + c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false); - DISPATCH_MATRIX_ON_FLAG(&c, - &c, - CPUMatrix::Scale(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix), - GPUMatrix::Scale(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix), - NOT_IMPLEMENTED, * c.m_GPUSparseMatrix = (*a.m_GPUSparseMatrix) * alpha); + if (alpha == 0) + { + c.Resize(a); + c.SetValue(0); // this is a little faster, and also does not propagate NaNs, which we'd expect from 'beta' parameters + return; + } + else + DISPATCH_MATRIX_ON_FLAG(&c, + &c, + CPUMatrix::Scale(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix), + GPUMatrix::Scale(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix), + NOT_IMPLEMENTED, * c.m_GPUSparseMatrix = (*a.m_GPUSparseMatrix) * alpha); } /// Matrix-scalar multiply with col-major matrices: a = alpha * a @@ -4469,15 +4517,17 @@ void Matrix::Scale(ElemType alpha, const Matrix& a, Matrix void Matrix::Scale(ElemType alpha, Matrix& a) { - if (a.IsEmpty()) + if (alpha == 0) + a.SetValue(0); // this is a little faster, and also does not propagate NaNs, which we'd expect from 'beta' parameters + else if (a.IsEmpty()) return; - - DISPATCH_MATRIX_ON_FLAG(&a, - &a, - CPUMatrix::Scale(alpha, *a.m_CPUMatrix), - GPUMatrix::Scale(alpha, *a.m_GPUMatrix), - NOT_IMPLEMENTED, - GPUSparseMatrix::Scale(alpha, *a.m_GPUSparseMatrix)); + else + DISPATCH_MATRIX_ON_FLAG(&a, + &a, + CPUMatrix::Scale(alpha, *a.m_CPUMatrix), + GPUMatrix::Scale(alpha, *a.m_GPUMatrix), + NOT_IMPLEMENTED, + GPUSparseMatrix::Scale(alpha, *a.m_GPUSparseMatrix)); } /// Matrix scalar matrix multiply with col-major matrices: a = alpha[0,0] * a diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index a900d0042dbb..d0540a6f4640 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -176,7 +176,7 @@ class MATH_API Matrix : public MatrixBase ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true); // by default we only reallocate if need to grow - void Resize(const Matrix& other) + void Resize(const Matrix& other) // TODO: Should this carry over numNZElemToReserve for sparse matrices? { Resize(other.GetNumRows(), other.GetNumCols()); } @@ -255,6 +255,9 @@ class MATH_API Matrix : public MatrixBase Matrix Transpose(); // This method doesn't change state of Matrix. 
It should be a const function Matrix& AssignTransposeOf(const Matrix& a); + Matrix& DoGatherColumnsOf (ElemType beta, const Matrix& m, const Matrix& a, ElemType alpha); + Matrix& DoScatterColumnsOf(ElemType beta, const Matrix& m, const Matrix& a, ElemType alpha); + Matrix& operator+=(const ElemType alpha); Matrix operator+(const ElemType alpha) const; Matrix& AssignSumOf(const ElemType alpha, const Matrix& a); From 0c62fb9f7f0c84aaf5205f9e2ebe72d197487b60 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sat, 12 Mar 2016 19:04:45 -0800 Subject: [PATCH 12/26] GPU version of Gather() and Scatter() --- Source/Common/Include/Sequences.h | 9 +- .../ComputationNetworkLib/ReshapingNodes.cpp | 1 + Source/Math/CPUMatrix.cpp | 4 +- Source/Math/GPUMatrix.cu | 134 ++++++++++++++++-- Source/Math/GPUMatrix.h | 3 + Source/Math/Matrix.cpp | 36 ++--- Source/Math/Matrix.h | 11 +- 7 files changed, 162 insertions(+), 36 deletions(-) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index 829c54c173ff..ed63912fe8a9 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -1062,12 +1062,13 @@ static inline void MaskMissingColumnsTo(Matrix& matrixToMask, const MB if (pMBLayout && pMBLayout->HasGaps(fr)) { #if 0 // in the future we can use the tensor lib to implement this - const auto & maskMatrix = pMBLayout->GetColumnsValidMask(); - auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout); - auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); - TensorView(matrixSliceToMask).DoMaskNegativeOf(0, TensorView(matrixSliceToMask), TensorView(maskSlice), 1); val; + const auto & maskMatrix = pMBLayout->GetColumnsValidMask(); + auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout); + auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); + TensorView(matrixSliceToMask).DoMaskNegativeOf(0, TensorView(matrixSliceToMask), TensorView(maskSlice), 1); val; #else const auto& maskMatrix = pMBLayout->GetColumnsValidityMask(matrixToMask.GetDeviceId()); + maskMatrix.TransferToDeviceIfNotThere(matrixToMask.GetDeviceId(), /*ismoved=*/ false, /*emptyTransfer=*/ false, /*updatePreferredDevice=*/ false); auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout); auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout); matrixSliceToMask.MaskColumnsValue(maskSlice, val); diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index eaf2217057a0..9f07164d63b2 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -89,6 +89,7 @@ template buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t]; } // the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode + Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true); Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor); } diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 6b411dc4929a..e0fd090ab25a 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -709,7 +709,7 @@ CPUMatrix& CPUMatrix::DoGatherColumnsOf(ElemType beta, const Resize(a.GetNumRows(), m.GetNumCols()); auto& us = *this; -//#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows. 
+#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows. foreach_column(jOut, us) { auto jInF = m(0, jOut); @@ -750,7 +750,7 @@ CPUMatrix& CPUMatrix::DoScatterColumnsOf(ElemType beta, cons size_t jOut = (size_t)jOutF; if (jOut >= GetNumCols()) InvalidArgument("DoScatterColumnsOf: Map out of bounds."); - ScaleAndAddColumn(beta, &us(0, jOut), &a(0, jIn), us.GetNumRows(), alpha); + ScaleAndAddColumn(/*beta=*/(ElemType)1, &us(0, jOut), &a(0, jIn), us.GetNumRows(), alpha); } return *this; diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 8c31ec4c9a50..f0f3fbc4e3e7 100644 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -918,11 +918,120 @@ GPUMatrix& GPUMatrix::AssignTransposeOf(const GPUMatrix +__global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType beta, const ElemType* m, size_t mStride, const ElemType* a, size_t aStride, size_t aCols, const ElemType alpha) +{ + size_t i = threadIdx.x; // index into 'us' and 'a' + size_t jOut = blockIdx.x; // index into 'us' and 'm' + + auto jInF = m[jOut * mStride]; // this is the column we need to get + if (jInF < 0) // negative index means gap + return; + size_t jIn = (size_t)jInF; + if (jIn >= aCols) + return; // actually a failure + + const ElemType& ra = a[i + jIn * aStride]; + ElemType& rus = us[i + jOut * usStride]; + + ElemType res = ra * alpha; + if (beta != 0) + res += rus * beta; + rus = res; +} + +// *this[:,j] = a[:,m[j]] * alpha + *this[:,j] * beta +template +GPUMatrix& GPUMatrix::DoGatherColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha) +{ + if (m.GetNumRows() != 1) // index is 1-dimensional only + InvalidArgument("DoGatherColumnsOf: Map must be a row vector."); + + if (beta) + VerifySize(a.GetNumRows(), m.GetNumCols()); + else + Resize(a.GetNumRows(), m.GetNumCols()); + + if (m.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId()) + InvalidArgument("All matrices must be on the same GPU"); + a.PrepareDevice(); + + SyncGuard syncGuard; + _doGatherColumnsOf << > >(m_pArray, GetNumRows(), beta, m.m_pArray, 1, a.m_pArray, a.GetNumRows(), a.GetNumCols(), alpha); + + return *this; +} + +template +__global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols, const ElemType* m, size_t mStride, const ElemType* a, size_t aStride, const ElemType alpha) +{ + size_t i = threadIdx.x; // index into 'a' and 'us' + size_t jIn = blockIdx.x; // index into 'a' and 'm' + + auto jOutF = m[jIn * mStride]; // this is the column we copy/add into + if (jOutF < 0) // negative index means gap + return; + size_t jOut = (size_t)jOutF; + if (jOut >= usCols) + return; // actually a failure + + const ElemType& ra = a[i + jIn * aStride]; + ElemType& rus = us[i + jOut * usStride]; + + ElemType res = ra * alpha; +#if 0 // this is not the reason. Some stupid bad index. 
+ rus += res; +#else + atomicAdd(&rus, res); +#endif + // Note: atomicAdd() is supposed to be fast in case of no conflict (the simple case of Scatter()) +} + +// little helper for debugging +template +static void Peek(const GPUMatrix& m, const char* which) +{ + size_t rows = m.GetNumRows(); + size_t cols = m.GetNumCols(); + ElemType buf[100] = { 0 }; + size_t n = min(rows * cols, _countof(buf)); + cudaMemcpy(buf, m.BufferPointer(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost); + UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here +} + +// *this[:,m[j]] = a[:,j] * alpha + *this[:,m[j]] * beta +template +GPUMatrix& GPUMatrix::DoScatterColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha) +{ + if (m.GetNumRows() != 1) // index is 1-dimensional only + InvalidArgument("DoScatterColumnsOf: Map must be a row vector."); + if (m.GetNumCols() != a.GetNumCols()) + InvalidArgument("DoScatterColumnsOf: Map must have width of input vector."); + if (a.GetNumRows() != GetNumRows()) + InvalidArgument("DoScatterColumnsOf: Output must have same height as input vector."); + + if (m.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId()) + InvalidArgument("All matrices must be on the same GPU"); + a.PrepareDevice(); + + auto& us = *this; + //Peek(us, "us"); Peek(m, "m"); Peek(a, "a"); + + // pre-scale with beta upfront + // Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding. + Scale(beta, us); // if beta is 0, then this will be a memset() + + SyncGuard syncGuard; + _doScatterColumnsOf << > >(m_pArray, GetNumRows(), GetNumCols(), m.m_pArray, 1, a.m_pArray, a.GetNumRows(), alpha); + + return *this; +} + template void GPUMatrix::SetValue(const ElemType v) { if (IsEmpty()) - LogicError("SetValue: Matrix is empty."); + return; CUDA_LONG N = (CUDA_LONG) GetNumElements(); @@ -2979,7 +3088,7 @@ void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template -void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) +/*static*/ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { @@ -2987,6 +3096,8 @@ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& } else { + if (a.IsEmpty() && c.IsEmpty()) + return; a.PrepareDevice(); if (a.IsEmpty() || c.IsEmpty()) LogicError("ScaleAndAdd: one of the input matrices is empty."); @@ -3088,7 +3199,7 @@ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& /// Input matrix /// Resulting matrix, user is responsible for allocating this template -void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) +/*static*/ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId() || a.GetComputeDeviceId() != b.GetComputeDeviceId()) { @@ -3096,6 +3207,8 @@ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& } else { + if (a.IsEmpty() && b.IsEmpty()) + return; a.PrepareDevice(); if (a.IsEmpty() || b.IsEmpty()) LogicError("ScaleAndAdd: one of the input matrices is empty."); @@ -3321,8 +3434,14 @@ void GPUMatrix::AddElementToElement(const GPUMatrix& a, cons } template -void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) +/*static*/ void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) { + if 
(alpha == 0) // if 0 then do not access the value, so that we can use this to multiply uninitialized matrices with beta=0 + { + CUDA_CALL(cudaMemset(a.m_pArray, 0, a.m_numRows * a.m_numCols * sizeof(ElemType))); + return; + } + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(float)) { @@ -3341,7 +3460,7 @@ void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) } template -void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) +/*static*/ void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) { if (alpha.GetNumElements() != 1) { @@ -3366,11 +3485,8 @@ void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& } template // c = alpha * a -void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) +/*static*/ void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { - if (a.IsEmpty()) - LogicError("Scale: Input matrix a is empty."); - c = a; Scale(alpha, c); } diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 822db1be5f5e..39b9d74ca0ef 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -210,6 +210,9 @@ class MATH_API GPUMatrix : public BaseMatrix GPUMatrix Transpose() const; GPUMatrix& AssignTransposeOf(const GPUMatrix& a); + GPUMatrix& DoGatherColumnsOf (ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha); + GPUMatrix& DoScatterColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha); + GPUMatrix& operator+=(const ElemType alpha); GPUMatrix operator+(const ElemType alpha) const; GPUMatrix& AssignSumOf(const ElemType alpha, const GPUMatrix& a); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index d884aba884b4..4d99d26b81bf 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -1080,7 +1080,7 @@ Matrix& Matrix::DoGatherColumnsOf(ElemType beta, const Matri DISPATCH_MATRIX_ON_FLAG(&a, this, m_CPUMatrix->DoGatherColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha), - NOT_IMPLEMENTED, //m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), + m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), NOT_IMPLEMENTED, NOT_IMPLEMENTED); @@ -1099,7 +1099,7 @@ Matrix& Matrix::DoScatterColumnsOf(ElemType beta, const Matr DISPATCH_MATRIX_ON_FLAG(&a, this, m_CPUMatrix->DoScatterColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha), - NOT_IMPLEMENTED, //m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), + m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha), NOT_IMPLEMENTED, NOT_IMPLEMENTED); @@ -1167,10 +1167,12 @@ template void Matrix::MaskColumnsValue(const Matrix& columnsMask, ElemType val) { if (GetNumCols() != columnsMask.GetNumCols()) - RuntimeError("Matrix and column mask must have equal number of columns"); + RuntimeError("MaskColumnsValue: Matrix and column mask must have equal number of columns."); - if (GetDeviceId() != columnsMask.GetDeviceId()) - RuntimeError("Matrix and column mask must be on the same device"); + if (GetCurrentMatrixLocation() == CPU && (columnsMask.GetCurrentMatrixLocation() == CPU || columnsMask.GetCurrentMatrixLocation() == BOTH)) + ; // OK + else if (GetDeviceId() != columnsMask.GetDeviceId() && columnsMask.GetCurrentMatrixLocation() != BOTH) + RuntimeError("MaskColumnsValue: Matrix and column mask must be on the same device."); DISPATCH_MATRIX_ON_FLAG(this, this, @@ -3470,7 +3472,8 @@ int Matrix::GetDeviceId() const // The inputs are only distinguished in that a's GPU takes precedence over b's in case 
they differ. // TODO: This is called somewhat inconsistently, sometimes with a=*this, sometimes with b=*this. template -void Matrix::DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b) +template +void Matrix::DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b) { int deviceIdA = a.GetDeviceId(), deviceIdB = b.GetDeviceId(); if (deviceIdA == deviceIdB) @@ -3541,21 +3544,21 @@ void Matrix::DecideAndMoveToRightDevice(const Matrix& a, con } template -void Matrix::_transferToDevice(int to_id, bool ismoved /*= true*/, bool emptyTransfer /* = false*/) const +void Matrix::_transferToDevice(int to_id, bool isBeingMoved /*= true*/, bool emptyTransfer /* = false*/) const { int from_id = GetDeviceId(); if (to_id == from_id) // nothing to do return; if (OwnBuffer()) - _transferFromDeviceToDevice(from_id, to_id, ismoved, emptyTransfer); + _transferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer); else RuntimeError("Cannot move externally owned matrices to the preferred device."); } // this function performs data transfer and updates data location, but not the device that is stored with it template -void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool ismoved /*= true*/, bool emptyTransfer /* = false*/) const +void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved /*= true*/, bool emptyTransfer /* = false*/) const { if (from_id < 0) from_id = CPUDEVICE; @@ -3606,7 +3609,7 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool m_GPUSparseMatrix->SetValue(*m_CPUSparseMatrix); } - if (ismoved) + if (isBeingMoved) { delete m_CPUSparseMatrix; m_CPUSparseMatrix = NULL; @@ -3632,7 +3635,7 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool m_GPUSparseMatrix->CopyToCPUSparseMatrix(*m_CPUSparseMatrix); } - if (ismoved) + if (isBeingMoved) { delete m_GPUSparseMatrix; m_GPUSparseMatrix = NULL; @@ -3666,7 +3669,7 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool { m_GPUMatrix = new GPUMatrix(to_id); } - if (ismoved) + if (isBeingMoved) { delete m_CPUMatrix; m_CPUMatrix = NULL; @@ -3698,7 +3701,7 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool m_CPUMatrix = new CPUMatrix(); } - if (ismoved) + if (isBeingMoved) { delete m_GPUMatrix; m_GPUMatrix = NULL; @@ -3718,9 +3721,9 @@ void Matrix::_transferFromDeviceToDevice(int from_id, int to_id, bool } template -void Matrix::TransferFromDeviceToDevice(int from_id, int to_id, bool ismoved, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const +void Matrix::TransferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const { - _transferFromDeviceToDevice(from_id, to_id, ismoved, emptyTransfer); + _transferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer); if (updatePreferredDevice) m_preferredDeviceId = GetDeviceId(); } @@ -5126,7 +5129,8 @@ template char* Matrix::BufferPointer() const; template int Matrix::GetDeviceId() const; template size_t Matrix::GetNumElements() const; template Matrix Matrix::ColumnSlice(size_t startColumn, size_t numCols) const; -template void Matrix::_transferToDevice(int id_to, bool ismoved, bool emptyTransfer) const; +template void Matrix::_transferToDevice(int id_to, bool isBeingMoved, bool emptyTransfer) const; +template void Matrix::TransferToDeviceIfNotThere(int id_to, bool isBeingMoved, bool emptyTransfer, bool updatePreferredDevice) const; template size_t 
Matrix::GetNumRows() const; template size_t Matrix::GetNumCols() const; template void Matrix::SetValue(const char); diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index d0540a6f4640..687b74b272a2 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -74,10 +74,11 @@ class MATH_API Matrix : public MatrixBase mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics // Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id - void _transferFromDeviceToDevice(int id_from, int id_to, bool ismoved = true, bool emptyTransfer = false) const; + void _transferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const; // Moves matrix from current device to device with id_to. This method doesn't change preferred device Id - void _transferToDevice(int id_to, bool ismoved = true, bool emptyTransfer = false) const; - static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b); + void _transferToDevice(int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const; + template + static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b); static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b, const Matrix& c); static void DecideAndMoveToRightDevice(const Matrix& a, const Matrix& b, const Matrix& c, const Matrix& d); static void CopyElementsFromDenseToSparse(CPUMatrix& from, CPUSparseMatrix& dest); @@ -132,9 +133,9 @@ class MATH_API Matrix : public MatrixBase void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId) { m_preferredDeviceId = preferredDeviceId; } // Moves matrix from device id_from to device with id_to. // If emptyTransfer=true, then no data is ever moved, just corresponding GPU/CPU matrices are deleted and then created using empty constructor - void TransferFromDeviceToDevice(int id_from, int id_to, bool ismoved = false, /*if false then keep source and set location to BOTH*/ bool emptyTransfer = false, bool updatePreferredDevice = true) const; + void TransferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = false, /*if false then keep source and set location to BOTH*/ bool emptyTransfer = false, bool updatePreferredDevice = true) const; // Same as TransferFromDeviceToDevice() but moves only if it is currently not on the target device - void TransferToDeviceIfNotThere(int id_to, bool ismoved = false, bool emptyTransfer = false, bool updatePreferredDevice = true) const; + void TransferToDeviceIfNotThere(int id_to, bool isBeingMoved = false, bool emptyTransfer = false, bool updatePreferredDevice = true) const; CurrentDataLocation GetCurrentMatrixLocation() const { return m_currentDataLocation; }; void SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat newMatrixFormat, bool keepValues); // sets matrix type between dense and sparse size_t GetNumRows() const; From 7b539bdbefad32d7c86c10edbc5a74e389af468d Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sun, 13 Mar 2016 09:56:07 -0700 Subject: [PATCH 13/26] (fixed NoGPU.cpp for last commit) --- Source/Math/NoGPU.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp index 47a6c2392cf2..13993c4b6e37 100644 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -921,6 +921,18 @@ GPUMatrix& GPUMatrix::AssignTransposeOf(const GPUMatrix +GPUMatrix& GPUMatrix::DoGatherColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha) +{ + return *this; +} + +template 
+GPUMatrix& GPUMatrix::DoScatterColumnsOf(ElemType beta, const GPUMatrix& m, const GPUMatrix& a, ElemType alpha) +{ + return *this; +} + template void GPUMatrix::SetValue(const ElemType v) { From 31f81c5a1be5c4e4002ac2d46ad11689e925071b Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sun, 13 Mar 2016 11:40:39 -0700 Subject: [PATCH 14/26] minor refactoring of m_nameToNodeMap; new class ComputationEnvironment through which the network can share information with all nodes --- .../ComputationEnvironment.h | 32 ++++++ .../ComputationNetwork.cpp | 8 +- .../ComputationNetwork.h | 57 +++++++--- .../ComputationNetworkEditing.cpp | 102 ++++++++++-------- .../ComputationNetworkLib.vcxproj | 1 + .../ComputationNetworkLib.vcxproj.filters | 6 ++ .../ComputationNetworkLib/ComputationNode.h | 16 ++- Source/Math/GPUMatrix.h | 1 + 8 files changed, 158 insertions(+), 65 deletions(-) create mode 100644 Source/ComputationNetworkLib/ComputationEnvironment.h diff --git a/Source/ComputationNetworkLib/ComputationEnvironment.h b/Source/ComputationNetworkLib/ComputationEnvironment.h new file mode 100644 index 000000000000..d860208d3431 --- /dev/null +++ b/Source/ComputationNetworkLib/ComputationEnvironment.h @@ -0,0 +1,32 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// + +#pragma once + +#include "Basics.h" + +#include + +namespace Microsoft { namespace MSR { namespace CNTK { + +// =========================================================================== +// ComputationEnvironment -- computation graph and operations +// =========================================================================== + +enum class NetworkOperationMode +{ + unspecified, + training, + inferring, + precomputing +}; + +struct ComputationEnvironment +{ + NetworkOperationMode networkOperationMode = NetworkOperationMode::unspecified; +}; +typedef shared_ptr ComputationEnvironmentPtr; + +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index c0f8d5e3464c..6898a406cdc4 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -64,7 +64,11 @@ void ComputationNetwork::ClearNetwork() // Once we allow that (BrainScript editing), we need proper cycle detectors. Luckily, we know our cycles, so it won't be too hard. // Or just use weak ptrs. for (auto& iter : m_nameToNodeMap) - iter.second->DetachInputs(); + { + auto& node = iter.second; + node->SetEnvironment(nullptr); + node->DetachInputs(); + } m_nameToNodeMap.clear(); @@ -1022,7 +1026,7 @@ void ComputationNetwork::PerformSVDecomposition(const map& SVDCo redVT.ColumnElementMultiplyWith(redS); // Step 2. create two new Parameter nodes and one Times node - wstring leftChildName = name + L"-U"; + wstring leftChildName = name + L"-U"; // BUGBUG: With BrainScript, node names must be proper identifiers/variable expressions. We can't have '-' in node names.
wstring rightChildName = name + L"-V"; shared_ptr> pLeft = AddNodeToNetWithElemType(New>(m_deviceId, leftChildName, m, r)); shared_ptr> pRight = AddNodeToNetWithElemType(New>(m_deviceId, rightChildName, r, n)); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 074061ad688c..87c5b7061f89 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -12,6 +12,7 @@ #include "ComputationNode.h" #include "ScriptableObjects.h" +#include "ComputationEnvironment.h" #include #include @@ -43,10 +44,11 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // construction // ----------------------------------------------------------------------- - ComputationNetwork() - : m_randomSeedOffset(0), - m_isCompiled(false), - m_pMBLayout(make_shared()) + ComputationNetwork() : + m_randomSeedOffset(0), + m_isCompiled(false), + m_pMBLayout(make_shared()), + m_environment(make_shared()) { } ComputationNetwork(DEVICEID_TYPE deviceId) @@ -283,6 +285,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // this counts the actual number of frames in a minibatch (not counting gaps in parallel sequences) // TODO: Instead of passing numAllSamples in here, we should determine it from the inputs in case of no layout. Or simply forbid this case. + // BUGBUG: With variable-length sequences, this can no longer be a network method. size_t GetNumSamplesWithLabel(const size_t numAllSamples) const { if (m_pMBLayout) @@ -329,7 +332,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb void ReplaceLeafNode(wstring oldNodeName, ComputationNodeBasePtr newNode); void ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode); void AddFeatureNode(ComputationNodeBasePtr featureNode); - void RemoveFeatureNode(ComputationNodeBasePtr featureNode); + ComputationNodeBasePtr RemoveFeatureNode(ComputationNodeBasePtr featureNode); void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr); void SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr); @@ -566,15 +569,14 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // TODO: move these close to where they are used // add a node to m_nameToNodeMap[], which is our node holder + // This only adds the node to the network's node set, without considering linkage. // Duplicate node names are rejected. ComputationNodeBasePtr AddNodeToNet(const ComputationNodeBasePtr& nodePtr) { - // found - // TODO: use .insert() and test result.second == false means not inserted since already exists - if (m_nameToNodeMap.find(nodePtr->NodeName()) != m_nameToNodeMap.end()) - RuntimeError("Duplicated computation node name."); - - m_nameToNodeMap[nodePtr->NodeName()] = nodePtr; + auto result = m_nameToNodeMap.insert(make_pair(nodePtr->NodeName(), nodePtr)); + if (!result.second) + RuntimeError("AddNodeToNet: Duplicated computation node name."); + nodePtr->SetEnvironment(m_environment); return nodePtr; // allows e.g. return AddNodeToNet(New...); } // TODO: not very nice--need to fix way more outside to get this right @@ -592,6 +594,27 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // return nodePtr; // allows e.g. 
return AddNodeToNetAndAttachInputs(New..., inputs); } + // add a node to the network unless it's already there + ComputationNodeBasePtr AddNodeToNetIfNotYet(const ComputationNodeBasePtr& nodePtr) + { + auto result = m_nameToNodeMap.insert(make_pair(nodePtr->NodeName(), nodePtr)); + if (!result.second && result.first->second != nodePtr) // if there's already one under this name, it better be nodePtr + RuntimeError("AddNodeToNetIfNotYet: Duplicated computation node name."); + nodePtr->SetEnvironment(m_environment); // (note: redundant if already part of the network) + return nodePtr; // allows e.g. return AddNodeToNet(New...); + } + + // remove a node from the network's node set + // This does NOT update any links referencing it, or node groups. + // TODO: We should verify that indeed this node is not referenced by other nodes or node groups, + // and that this node does not reference any node inside the network. + ComputationNodeBasePtr RemoveNodeFromNet(const ComputationNodeBasePtr& node) + { + node->SetEnvironment(nullptr); + m_nameToNodeMap.erase(node->NodeName()); + return node; + } + public: // ----------------------------------------------------------------------- // evaluation // ----------------------------------------------------------------------- @@ -847,7 +870,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb m_randomSeedOffset = value; } -protected: +private://protected: DEVICEID_TYPE m_deviceId; // TODO: is this shared by all nodes? unsigned long m_randomSeedOffset; @@ -869,9 +892,12 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb // used for sentence boundary information passed from reader to reset RNN state // specify how the minibatch is packed for each sample - // TODO: This will change once we allow for multiple inconsistent layouts. + // BUGBUG: With variable-length inconsistent layouts, this can no longer be a network property. MBLayoutPtr m_pMBLayout; // note that this must be installed before doing anything that needs it (default leaves a nullptr) + // environment information that nodes may want to inquire about, e.g. to know whether we are training + ComputationEnvironmentPtr m_environment; + private: // ----------------------------------------------------------------------- // the following members are all result of post-processing by CompileNetwork() @@ -908,8 +934,5 @@ template class Matrix; // TODOs: // - automatic inference of time window w.r.t. delay nodes (and related nodes such as a temporal pooling) // - have overrides of RuntimeError etc.
in ComputationNode, which prepend the error string with the node name and operation -// - code prettification: -// - sort all node implementations' methods into the same order; esp, ForwardProp() comes before partial -// - sort important nodes first; move unused/experimental nodes into source files named accordingly -} } } +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp index 5707878e4dec..e022291bf408 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp @@ -103,30 +103,26 @@ void ComputationNetwork::CopyInputs(const std::wstring fromName, std::wstring to // RenameNode - Rename a node to another name // nodeNameOrig - original node name // nodeNameNew - new node name -void ComputationNetwork::RenameNode(const std::wstring& nodeNameOrig, const std::wstring& nodeNameNew) +void ComputationNetwork::RenameNode(const std::wstring& nodeNameOrig, const std::wstring& newNodeName) { - InvalidateCompiledNetwork(); - - ComputationNodeBasePtr nodeToRename = GetNodeFromName(nodeNameOrig); + RenameNode(GetNodeFromName(nodeNameOrig), newNodeName); } - auto iter = m_nameToNodeMap.find(nodeNameNew); +void ComputationNetwork::RenameNode(ComputationNodeBasePtr node, const std::wstring& newNodeName) +{ + // make sure the new name is not already used + auto iter = m_nameToNodeMap.find(newNodeName); if (iter != m_nameToNodeMap.end()) // found RuntimeError("RenameNode: Target name already exists."); - // rename the node and update the mapping table - nodeToRename->SetNodeName(nodeNameNew); - m_nameToNodeMap.erase(nodeNameOrig); - m_nameToNodeMap[nodeNameNew] = nodeToRename; -} + InvalidateCompiledNetwork(); -void ComputationNetwork::RenameNode(ComputationNodeBasePtr node, const std::wstring& newNodeName) -{ - // TODO: check if new name exists - m_nameToNodeMap.erase(node->NodeName()); - node->SetNodeName(newNodeName); - AddNodeToNet(node); + RemoveNodeFromNet(node); // take it out temporarily + node->SetNodeName(newNodeName); // change the name + AddNodeToNet(node); // and put it back } +// deletes a node from the network including setting all input links to it to null, and removing it from the node groups void ComputationNetwork::DeleteNode(const std::wstring& nodeName) { InvalidateCompiledNetwork(); @@ -165,20 +161,24 @@ void ComputationNetwork::DeleteNode(const std::wstring& nodeName) // Note: the necessary update of m_allSEQNodes is handled by the InvalidateCompiledNetwork() call above // delete the node itself - m_nameToNodeMap.erase(nodeName); // this will deref the node and possibly deallocate it + RemoveNodeFromNet(nodeToDelete); } -// change the node associated with nodeName to newNode; used in the KL-reg based adaptation to reduce feature copy -// need to update all the mappings as well childrens +// replace a named node by newNode of the same type under the same name, including moving over all network links +// This is used in the KL-reg based adaptation to reduce feature copy +// need to update all the mappings as well as the children. void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr newNode) { - InvalidateCompiledNetwork(); - ComputationNodeBasePtr oldNode = GetNodeFromName(nodeName); + + if (newNode->NodeName() != nodeName) // TODO: This was not tested for earlier; I hope no code depends on this.
+ InvalidArgument("ChangeNode: newNode must have the same name as the old node."); if (oldNode->OperationName() != newNode->OperationName()) - InvalidArgument("newNode must have the same type as the old node."); + InvalidArgument("ChangeNode: newNode must have the same type as the old node."); - // change children + InvalidateCompiledNetwork(); + + // change all nodes to have old node as input to point to the new node instead for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++) { ComputationNodeBasePtr node = nodeIter->second; @@ -187,12 +187,18 @@ void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr new node->SetInput(i, newNode); } - // change name map - m_nameToNodeMap[nodeName] = newNode; + // change all inputs of this new node to share the old one's inputs for (int i = 0; i < oldNode->GetNumInputs(); i++) - newNode->SetInput(i, oldNode->GetInputs()[i]); + { + newNode->SetInput(i, oldNode->GetInputs()[i]); // TODO: use AttachInput()? + //oldNode->SetInput(i, nullptr); // BUGBUG: old node should no longer point into the network + } + + // replace the node in the network + RemoveNodeFromNet(oldNode); + AddNodeToNet(newNode); - // change other maps + // also update node groups for (auto groupIter : GetAllNodeGroups()) { auto& group = *groupIter; @@ -204,13 +210,17 @@ void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr new // replace the old node with the current node, assuming the old node is a leaf node // need to update those nodes who use oldNode as their child +// TODO: Can this be called with a node that's already part of the network? This is currently allowed, but should it? +// BUGBUG: Seems ChangeNode() also updates node groups. Why doesn't this function? +// BUGBUG: What if newNode is the one referenced by oldNodeName? +// BUGBUG: Or what if an unrelated node of the same name exists? void ComputationNetwork::ReplaceLeafNode(wstring oldNodeName, ComputationNodeBasePtr newNode) { InvalidateCompiledNetwork(); ComputationNodeBasePtr oldNode = GetNodeFromName(oldNodeName); - // change the input of those nodes whose child is oldNode + // relink the input of those nodes whose child is oldNode to point to the new one instead for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++) { ComputationNodeBasePtr node = nodeIter->second; @@ -218,18 +228,19 @@ void ComputationNetwork::ReplaceLeafNode(wstring oldNodeName, ComputationNodeBas if (node->GetInputs()[i] == oldNode) node->SetInput(i, newNode); } - m_nameToNodeMap[newNode->GetName()] = newNode; - // now the old node becomes a orphan node , remove it - DeleteNode(oldNodeName); - // RemoveOrphanNode(oldNode); + // add the new, remove the old + AddNodeToNetIfNotYet(newNode); + DeleteNode(oldNodeName); // TODO: can this just be RemoveNodeFromNet()? } +// add a new criterion node and at the same time orphan the previous one (it won't be removed) +// BUGBUG: Can this operate on both new and existing nodes? void ComputationNetwork::ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode) { InvalidateCompiledNetwork(); - // Checks if the node is a criterion node. 
+ // checks if the node is a criterion node int index = -1; for (int i = 0; i < m_finalCriteria.size(); ++i) { @@ -242,32 +253,32 @@ void ComputationNetwork::ReplaceFinalCriterionNode(wstring oldNodeName, Computat if (index == -1) RuntimeError("ReplaceFinalCriterionNode: the node to be replaced is not a criterion node."); - // Replaces children. + // replace children for (int i = 0; i < newNode->GetNumInputs(); ++i) { if (m_nameToNodeMap.find(newNode->GetInputs()[i]->NodeName()) == m_nameToNodeMap.end()) RuntimeError("Child node does not exist."); newNode->SetInput(i, m_nameToNodeMap[newNode->GetInputs()[i]->NodeName()]); + // TODO: Remove the strange indirection through nameToNodeMap, just use the ptr directly? } - // Addes it to criterion node list. + // add it to the network + AddNodeToNetIfNotYet(newNode); + + // add it to criterion node list m_finalCriteria[index] = newNode; - m_nameToNodeMap[newNode->NodeName()] = newNode; } void ComputationNetwork::AddFeatureNode(ComputationNodeBasePtr featureNode) { InvalidateCompiledNetwork(); - wstring nodeName = featureNode->NodeName(); - if (NodeNameExists(nodeName)) - RuntimeError("AddFeatureNode: feature node already exists."); - m_nameToNodeMap[nodeName] = featureNode; + AddNodeToNet(featureNode); m_features.push_back(featureNode); } -// We only remove the node, not delete it. -void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) +// We only remove the node from the net, not destroy it. +ComputationNodeBasePtr ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) { InvalidateCompiledNetwork(); @@ -275,7 +286,7 @@ void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) if (!NodeNameExists(nodeName)) RuntimeError("RemoveFeatureNode: feature node does not exist."); - // Removes links.
+ // removes links for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); ++nodeIter) { ComputationNodeBasePtr node = nodeIter->second; @@ -295,7 +306,7 @@ void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) if (search != m_features.end()) m_features.erase(search); - m_nameToNodeMap.erase(nodeName); + return RemoveNodeFromNet(featureNode); } // sets m_learningRateMultiplier in all LearnableParameters feeding into the passed rootNode @@ -360,4 +371,5 @@ void ComputationNetwork::SetBatchNormalizationNodesBelowEvalMode(const bool eval } } } -} } } + +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj index 2c08f30899a9..82e7df227d79 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj @@ -137,6 +137,7 @@ + diff --git a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters index ac39ae84bf79..214a243a350a 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters +++ b/Source/ComputationNetworkLib/ComputationNetworkLib.vcxproj.filters @@ -126,6 +126,9 @@ Nodes + + Environment + @@ -149,5 +152,8 @@ {7d838fa4-b5a1-4b8a-b37d-823fb026055b} + + {ed685e39-b7dd-4546-a865-149664fa71a4} + \ No newline at end of file diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index a7a9c521db22..c80866c6e5d5 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -11,6 +11,7 @@ #include "Sequences.h" #include "TensorShape.h" #include "MatrixPool.h" +#include "ComputationEnvironment.h" #include #include @@ -610,6 +611,14 @@ protected: public: // ...the following should be protected, but nodes inquire ab // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features. virtual bool /*IComputationNode::*/ RequiresPreCompute() const { return false; } + const ComputationEnvironment& Environment() + { + if (!m_environment) + LogicError("Environment: No environment has been set."); + return *m_environment; + } + void SetEnvironment(ComputationEnvironmentPtr environment) { m_environment = environment; } + // ----------------------------------------------------------------------- // validation // ----------------------------------------------------------------------- @@ -816,6 +825,10 @@ protected: public: // ...the following should be protected, but nodes inquire ab TensorShape m_sampleLayout; // sample layout MBLayoutPtr m_pMBLayout; + // environment information + // This structure is shared with the ComputationNetwork that this node lives in + ComputationEnvironmentPtr m_environment; + // flags related to gradient propagation float m_learningRateMultiplier; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves. 
bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 @@ -1962,4 +1975,5 @@ class BinaryElementWiseNode : public ComputationNode, public NumInputs #define UsingBinaryElementwiseNodeBaseMembers UsingComputationNodeMembersBoilerplate; #pragma endregion base computation class -} } } + +}}} diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 39b9d74ca0ef..f0387c6f7108 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -112,6 +112,7 @@ class MATH_API GPUMatrix : public BaseMatrix using BaseMatrix::GetArray; using BaseMatrix::GetNumRows; using BaseMatrix::GetNumCols; + using BaseMatrix::VerifySize; private: static cublasHandle_t s_cuHandle[MaxGpus]; From 3cd0308aa127694f75a99532f421402c7675bae5 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Sun, 13 Mar 2016 15:34:36 -0700 Subject: [PATCH 15/26] new node EnvironmentInput, which exposes the isTraining property to graph operations --- .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs | 1 + .../ComputationEnvironment.h | 45 +++++++-- .../ComputationNetwork.h | 21 +++-- .../ComputationNetworkBuilder.cpp | 1 + .../ComputationNetworkEvaluation.cpp | 3 + .../ComputationNetworkScripting.cpp | 9 +- .../ComputationNetworkLib/ComputationNode.h | 3 +- .../InputAndParamNodes.h | 94 ++++++++++++++++++- .../ComputationNetworkLib/PreComputeNodes.h | 2 + Source/ComputationNetworkLib/TrainingNodes.h | 20 ++-- Source/SGDLib/SGD.cpp | 14 ++- Source/SGDLib/SimpleEvaluator.h | 2 + Source/SGDLib/SimpleOutputWriter.h | 4 + 13 files changed, 173 insertions(+), 46 deletions(-) diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index ece5ffd5412f..94108f9d3853 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -36,6 +36,7 @@ Input(dims, tag='feature') = new ComputationNode [ operation = 'InputValue' ; sh SparseInput(dims, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ] ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ] SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ] +EnvironmentInput(propertyName, tag='') = new ComputationNode [ operation = 'EnvironmentInput' /*plus the function args*/ ] Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, init = 'fixedValue', value = val) PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ] FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ] diff --git a/Source/ComputationNetworkLib/ComputationEnvironment.h b/Source/ComputationNetworkLib/ComputationEnvironment.h index d860208d3431..58c2c8e2f301 100644 --- a/Source/ComputationNetworkLib/ComputationEnvironment.h +++ b/Source/ComputationNetworkLib/ComputationEnvironment.h @@ -6,27 +6,56 @@ #pragma once #include "Basics.h" - #include namespace Microsoft 
{ namespace MSR { namespace CNTK { // =========================================================================== -// ComputationEnvironment -- computation graph and operations +// ComputationEnvironment -- global network properties of interest to nodes // =========================================================================== +// mode that the network is currently used in, which affects node behavior enum class NetworkOperationMode { - unspecified, - training, - inferring, - precomputing + training, // training mode specifically means nodes should behave like training (e.g. Dropout should be active) + inferring, // inferring (e.g. BatchNorm should not update mean estimates) + preComputing // precomputation is a part of training where most nodes should behave like they are inferring }; +// class to store global properties of the network that are of interest to the nodes +// For example, a network can be in 'training' or 'inference' mode, which affects what nodes like Dropout and BN do, +// or what the seq-2-seq decoder feedback signal is. struct ComputationEnvironment { - NetworkOperationMode networkOperationMode = NetworkOperationMode::unspecified; + // networkOperationMode tells whether we are training or inferring, which affects some nodes' behavior + NetworkOperationMode networkOperationMode = NetworkOperationMode::inferring; // by default, a network is always able to infer + bool IsTraining() const { return networkOperationMode == NetworkOperationMode::training; } + bool IsPreComputing() const { return networkOperationMode == NetworkOperationMode::preComputing; } + + // more properties should be added here as needed +}; +typedef std::shared_ptr ComputationEnvironmentPtr; + +// RAII wrapper for setting and reverting ComputationEnvironment::networkOperationMode +// E.g. ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); +// will set the mode until the end of the scope, and then revert to its old value automatically. 
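+// A minimal usage sketch (this is how SGD wraps a training epoch; 'net' is assumed to be a ComputationNetworkPtr): +// { +// ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); +// /* minibatch loop runs here; nodes now see Environment().IsTraining() == true */ +// } // leaving the scope restores the previous mode +// The guard is deliberately non-copyable (its copy assignment is deleted below), so the saved mode is restored exactly once.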
+class ScopedNetworkOperationMode +{ + ComputationEnvironment& m_environment; + NetworkOperationMode m_previousNetworkOperationMode; + void operator=(const ScopedNetworkOperationMode&) = delete; +public: + template // using template to avoid dependency + ScopedNetworkOperationMode(const std::shared_ptr& net, NetworkOperationMode networkOperationMode) : + m_environment(net->Environment()) + { + m_previousNetworkOperationMode = m_environment.networkOperationMode; + m_environment.networkOperationMode = networkOperationMode; + } + ~ScopedNetworkOperationMode() // destructor restores the previous mode + { + m_environment.networkOperationMode = m_previousNetworkOperationMode; + } }; -typedef shared_ptr ComputationEnvironmentPtr; }}} diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 87c5b7061f89..7dd540e14dcd 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -385,10 +385,17 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb return nodes; } + // ----------------------------------------------------------------------- + // environment properties + // ----------------------------------------------------------------------- + + ComputationEnvironment& Environment() const { return *m_environment; } + // ----------------------------------------------------------------------- // functions to pass on specific SGD options to nodes // ----------------------------------------------------------------------- + // TODO: Why are all these static, but then take a network as the first argument? --> make them class members template static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed); @@ -398,7 +405,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb template static void SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, - const double& hsmoothingWeight, + const double& hsmoothingWeight, // TODO: Why are all these passed by reference? const double& frameDropThresh, const bool& doreferencealign, const double& amf = 14.0f, @@ -406,6 +413,7 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb const double& wp = 0.0f, const double& bMMIfactor = 0.0f, const bool& sMBR = false); + static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples); // ----------------------------------------------------------------------- @@ -474,12 +482,6 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb return m_nameToNodeMap.size(); } - // TODO: could be a dup - std::map& GetNameToNodeMap() // specially for ExperimentalNetworkBuilder; don't use this otherwise - { - return m_nameToNodeMap; - } - std::vector GetAllNodes() const { std::vector nodes; @@ -595,13 +597,14 @@ class ComputationNetwork : public ScriptableObjects::Object, public ScriptableOb } // add a node to the network unless it's already there - ComputationNodeBasePtr AddNodeToNetIfNotYet(const ComputationNodeBasePtr& nodePtr) + // Returns false if the node was already there. 
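+ // (Callers can use that return value to skip nodes they have already visited; the BrainScript network constructor uses it that way.)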
+ bool AddNodeToNetIfNotYet(const ComputationNodeBasePtr& nodePtr) { auto result = m_nameToNodeMap.insert(make_pair(nodePtr->NodeName(), nodePtr)); if (!result.second && result.first->second != nodePtr) // if there's already one under this name, it better be nodePtr RuntimeError("AddNodeToNetIfNotYet: Duplicated computation node name."); nodePtr->SetEnvironment(m_environment); // (note: redundant if already part of the network) - return nodePtr; // allows e.g. return AddNodeToNet(New...); + return result.second; } // remove a node from the network's node set diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index 2fed249ca72a..72eb3792b4c6 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -49,6 +49,7 @@ static shared_ptr> CreateStandardNode(const std::wstri else if (nodeType == OperationNameOf(DropoutNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(DummyCriterionNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ElementTimesNode)) return New>(forward<_Types>(_Args)...); + else if (nodeType == OperationNameOf(EnvironmentInputNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ErrorPredictionNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(ExpNode)) return New>(forward<_Types>(_Args)...); else if (nodeType == OperationNameOf(FutureValueNode)) return New>(forward<_Types>(_Args)...); diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 957c257da5fa..7e1c3de72218 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -66,6 +66,9 @@ static bool SetGradientToScalarOne(ComputationNodeBasePtr nodep) // - Backprop() for the training criterion void ComputationNetwork::Backprop(const ComputationNodeBasePtr rootNode) // training criterion to compute the gradients for { + if (!Environment().IsTraining()) + LogicError("Backprop: Requires the network to be in training mode."); + // reset all gradients to zero (actually, internally, this is lazy, but we don't care here) ZeroGradients(rootNode); diff --git a/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp b/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp index e7f1be1803a9..b11f944cc5c7 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkScripting.cpp @@ -65,12 +65,9 @@ ComputationNetwork::ComputationNetwork(const IConfigRecordPtr configp) workList.pop_front(); // add to set - let res = m_nameToNodeMap.insert(make_pair(node->NodeName(), node)); - if (!res.second) // not inserted: we already got this one - if (res.first->second == node) - continue; // the same - else // oops, a different node with the same name - LogicError("ComputationNetwork: multiple nodes with the same NodeName() '%ls'", node->NodeName().c_str()); + let wasAdded = AddNodeToNetIfNotYet(node); + if (!wasAdded) // node already there (above will fail if there is a different node with the same name) + continue; // If node derives from ILateAttachingNode() then it has unresolved inputs. Resolve them now. // This may generate a whole new load of nodes, including nodes which in turn have late init.
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index c80866c6e5d5..976c83cd72db 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -611,7 +611,7 @@ protected: public: // ...the following should be protected, but nodes inquire ab // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features. virtual bool /*IComputationNode::*/ RequiresPreCompute() const { return false; } - const ComputationEnvironment& Environment() + const ComputationEnvironment& Environment() const { if (!m_environment) LogicError("Environment: No environment has been set."); @@ -1799,6 +1799,7 @@ protected: using Base::DetermineElementwiseTensorRank; \ using Base::DumpNodeInfo; \ using Base::EnumerateNodes; \ + using Base::Environment; \ using Base::ForwardProp; \ using Base::GetAsMatrixNumCols; \ using Base::GetAsMatrixNumRows; \ diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 4e015b4239ab..e836e1f63279 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -253,11 +253,13 @@ class LearnableParameter : public ComputationNode, public NumInputs<0> { } - virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override + virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override { } - virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override + + virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override { + LogicError("%ls %ls operation is a leaf node. BackpropTo() should never be called.", NodeName().c_str(), OperationName().c_str()); } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override @@ -410,10 +412,12 @@ class InputValueBase : public ComputationNode, public NumInputs<0> virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override { + // we have been filled by the Reader } + virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) { - LogicError("InputValueBase::BackpropTo() should never be called."); + LogicError("%ls is a leaf node. BackpropTo() should never be called.", NodeName().c_str()); } virtual void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override @@ -510,6 +514,90 @@ class SparseInputValue : public InputValueBase template class SparseInputValue; template class SparseInputValue; +// ----------------------------------------------------------------------- +// EnvironmentInput (propertyName) -- read out environment properties +// Such as whether we are currently training or evaluating, which can affect +// behavior, such as seq-2-seq decoding. 
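+// A hypothetical BrainScript usage sketch: isTrainingFlag = EnvironmentInput('isTraining') +// would yield a 1x1 value that reads 1 while the network is training and 0 otherwise. +// (The variable name is illustrative only; 'isTraining' is the one property supported below.)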
+// ----------------------------------------------------------------------- + +template +class EnvironmentInputNode : public ComputationNodeNonLooping, public NumInputs<0> +{ + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"EnvironmentInput"; } + +public: + EnvironmentInputNode(DEVICEID_TYPE deviceId, const wstring& name, const wstring& propertyName = L"") : + Base(deviceId, name), m_propertyName(propertyName) + { + } + EnvironmentInputNode(const ScriptableObjects::IConfigRecordPtr configp) + : EnvironmentInputNode(configp->Get(L"deviceId"), L"", configp->Get(L"propertyName")) + { + } + + virtual void Save(File& fstream) const override + { + Base::Save(fstream); + fstream << m_propertyName; + } + + virtual void Load(File& fstream, size_t modelVersion) override + { + Base::Load(fstream, modelVersion); + fstream >> m_propertyName; + } + +private: + ElemType ReadOutVariable() const + { + const auto& e = Environment(); + if (m_propertyName == L"isTraining") + return (ElemType)e.IsTraining(); + else + InvalidArgument("EnvironmentInput: There is no environment property '%ls'", m_propertyName.c_str()); + } + +public: + // TODO: No one else overrides this method. So is this the right mechanism? + // On the other hand, we are also the only leaf that needs to update itself. + virtual bool /*ComputationNodeBase::*/ IsOutOfDateWrtInputs() const override { return true; } + + virtual void /*IComputationNode::*/ BeginForwardProp() override + { + // We are a leaf, so UpdateFunctionValuesSize() won't be called for us. + UpdateFunctionValuesSize(); + Base::BeginForwardProp(); + } + + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override + { + ElemType val = ReadOutVariable(); + Value().VerifySize(1, 1); + Value().SetValue(val); + } + + virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override + { + LogicError("%ls %ls operation is a leaf node. BackpropTo() should never be called.", NodeName().c_str(), OperationName().c_str()); + } + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + + virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override + { + ReadOutVariable(); // read out the value once, with the purpose of validating the propertyName + Base::Validate(isFinalValidationPass); + // this node does not hold mini-batch data + m_pMBLayout = nullptr; + // for now, anything this node returns is a scalar + SetDims(TensorShape(1), false); + } + +private: + wstring m_propertyName; +}; + // ----------------------------------------------------------------------- // LookupTableNode (embedding matrix, bag-of-word representation of the inputs) // Implements an embedding. The input vector can consist of multiple stacked diff --git a/Source/ComputationNetworkLib/PreComputeNodes.h b/Source/ComputationNetworkLib/PreComputeNodes.h index 96ae72479476..6a8e5e8c8532 100644 --- a/Source/ComputationNetworkLib/PreComputeNodes.h +++ b/Source/ComputationNetworkLib/PreComputeNodes.h @@ -46,6 +46,8 @@ class PreComputedNodeBase : public ComputationNodeNonLooping /*ComputationNode*/ // This is used for resetting and updating from accumulators.
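+ // Note: it may only be called while the network is in preComputing mode (enforced below).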
virtual void /*IPreComputeNode::*/ MarkComputed(const bool hasComputed) override { + if (!Environment().IsPreComputing()) + LogicError("MarkComputed: Network must be in preComputing mode."); m_hasComputed = hasComputed; } diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h index 1450f1787dae..9d988e212cbc 100644 --- a/Source/ComputationNetworkLib/TrainingNodes.h +++ b/Source/ComputationNetworkLib/TrainingNodes.h @@ -370,12 +370,8 @@ template class CrossEntropyNode; template class MatrixL1RegNode : public ComputationNodeNonLooping /*ComputationNode*/, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"MatrixL1Reg"; - } + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"MatrixL1Reg"; } public: DeclareConstructorFromConfigWithNumInputs(MatrixL1RegNode); @@ -384,7 +380,7 @@ class MatrixL1RegNode : public ComputationNodeNonLooping /*ComputationNode*/GetMBLayout()); assert(inputIndex == 0); @@ -463,12 +459,8 @@ template class MatrixL1RegNode; template class MatrixL2RegNode : public ComputationNodeNonLooping /*ComputationNode*/, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; - UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() - { - return L"MatrixL2Reg"; - } + typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() { return L"MatrixL2Reg"; } public: DeclareConstructorFromConfigWithNumInputs(MatrixL2RegNode); @@ -477,7 +469,7 @@ class MatrixL2RegNode : public ComputationNodeNonLooping /*ComputationNode*/GetMBLayout()); assert(inputIndex == 0); diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 82404e2f2b5b..5b086aa709a7 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -732,6 +732,8 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, /*in/out*/ size_t& totalSamplesSeen, std::string prefixMsg) { + ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); + double totalTimeInMBs = 0; // use double since timer has sub-microsecond time resolution double epochCriterionLastMBs = 0; @@ -764,8 +766,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, { m_pMASGDHelper->OnEpochStart(learnableNodes); } - - std::vector*> learnParamsGradients; if (useGradientAggregation) @@ -1287,13 +1287,15 @@ bool SGD::PreCompute(ComputationNetworkPtr net, fprintf(stderr, "\tNodeName: %ls\n", (node->NodeName()).c_str()); // compute + ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::preComputing); + // trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , requestDataSize); // trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0 , m_epochSize); // only based on one epoch // [1/12/2015 erw] to support large dataset, we usually partition whole dataset into several epoch's, // so we need to use all the data to do precomputing if (m_useAllDataForPreComputedNode) // using all the data trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0); - else // using only one epoch + else // using only one epoch. Note: One epoch is often enough for feature mean/stddev, but not for estimating priors. 
trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize); net->StartEvaluateMinibatchLoop(nodes); @@ -2179,10 +2181,12 @@ bool SGD::GradientCheck(ComputationNetworkPtr net, const std::list& learnableNodes, int npos) { - vector errMsgs; + ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); net->StartEvaluateMinibatchLoop(criterionNodes[npos]); + vector errMsgs; // TODO: These are created but actually not returned, only their count is checked. + // gradient checking for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++) { @@ -2264,7 +2268,7 @@ bool SGD::GradientCheck(ComputationNetworkPtr net, } } - return errMsgs.size() == 0; + return errMsgs.empty(); } template class SGD; diff --git a/Source/SGDLib/SimpleEvaluator.h b/Source/SGDLib/SimpleEvaluator.h index be31c0fae794..0e734dda0c5d 100644 --- a/Source/SGDLib/SimpleEvaluator.h +++ b/Source/SGDLib/SimpleEvaluator.h @@ -47,6 +47,8 @@ class SimpleEvaluator // returns evaluation node values per sample determined by evalNodeNames (which can include both training and eval criterion nodes) vector Evaluate(IDataReader* dataReader, const vector& evalNodeNames, const size_t mbSize, const size_t testSize = requestDataSize) { + ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring); + // determine nodes to evaluate std::vector evalNodes; diff --git a/Source/SGDLib/SimpleOutputWriter.h b/Source/SGDLib/SimpleOutputWriter.h index e6a8b057287b..812adbf4cd11 100644 --- a/Source/SGDLib/SimpleOutputWriter.h +++ b/Source/SGDLib/SimpleOutputWriter.h @@ -87,6 +87,8 @@ class SimpleOutputWriter void WriteOutput(IDataReader& dataReader, size_t mbSize, IDataWriter& dataWriter, const std::vector& outputNodeNames, size_t numOutputSamples = requestDataSize, bool doUnitTest = false) { + ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring); + std::vector outputNodes = DetermineOutputNodes(outputNodeNames); std::vector inputNodes = DetermineInputNodes(outputNodes); @@ -190,6 +192,8 @@ class SimpleOutputWriter // TODO: Remove code dup with above function by creating a fake Writer object and then calling the other function. 
void WriteOutput(IDataReader& dataReader, size_t mbSize, std::wstring outputPath, const std::vector& outputNodeNames, const WriteFormattingOptions & formattingOptions, size_t numOutputSamples = requestDataSize) { + ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring); + std::vector outputNodes = DetermineOutputNodes(outputNodeNames); std::vector inputNodes = DetermineInputNodes(outputNodes); From abfc23e041b89f2c2ddc32ec0e79fba58ebba4c4 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Mon, 14 Mar 2016 20:01:31 -0700 Subject: [PATCH 16/26] writeWordAndClassInfo can now also write a regular labelMapping file; WriteMinibatchWithFormatting() now supports packed sequences --- Source/ActionsLib/OtherActions.cpp | 111 ++++++++++-------- .../ComputationNetworkLib/ComputationNode.cpp | 68 +++++------ .../LMSequenceReader/SequenceReader.cpp | 2 + 3 files changed, 99 insertions(+), 82 deletions(-) diff --git a/Source/ActionsLib/OtherActions.cpp b/Source/ActionsLib/OtherActions.cpp index 39cb6ac45ccc..1d7629600341 100644 --- a/Source/ActionsLib/OtherActions.cpp +++ b/Source/ActionsLib/OtherActions.cpp @@ -205,13 +205,18 @@ template void DoParameterSVD(const ConfigParameters& config); // DoWriteWordAndClassInfo() - implements CNTK "writeWordAndClass" command // =========================================================================== -// BUGBUG: This should compare both elements (first one is the word name). This current version leads to different sorting and thus class definitions with VS and gcc. +// compare functor for sorting by the second element of a pair +// TODO: just use a lambda template struct compare_second { bool operator()(const T& lhs, const T& rhs) const { - return lhs.second < rhs.second; + // BUGBUG: This should compare both elements (first one is the word name). This current version leads to different sorting and thus class definitions with VS and gcc. + //if (lhs.second == rhs.second) // if second element + // return lhs.first < rhs.first; + //else + return lhs.second < rhs.second; } }; @@ -242,8 +247,9 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) int cutoff = config(L"cutoff", "1"); string inputFile = config(L"inputFile"); // training text file without - string outputVocabFile = config(L"outputVocabFile"); - string outputWord2Cls = nbrCls > 0 ? config(L"outputWord2Cls") : string(); + string outputMappingFile = config(L"outputMappingFile", ""); // if specified then write a regular mapping file + string outputVocabFile = config(L"outputVocabFile"); + string outputWord2Cls = nbrCls > 0 ? config(L"outputWord2Cls") : string(); string outputCls2Index = nbrCls > 0 ?
config(L"outputCls2Index") : string(); string unkWord = config(L"unk", ""); @@ -254,14 +260,16 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) if (beginSequence.empty() || endSequence.empty()) InvalidArgument("Please specify parameters 'beginSequence' and 'endSequence'."); - std::cerr << "Vocabulary file: " << outputVocabFile << std::endl; + if (!outputMappingFile.empty()) + cerr << "Mapping file --> " << outputVocabFile << endl; + cerr << "Vocabulary file --> " << outputVocabFile << endl; if (nbrCls > 0) { - std::cerr << "Word-to-class map: " << outputWord2Cls << std::endl; - std::cerr << "Class-to-index map: " << outputCls2Index << std::endl; + cerr << "Word-to-class map --> " << outputWord2Cls << endl; + cerr << "Class-to-index map --> " << outputCls2Index << endl; } - std::cerr << std::endl; - + cerr << endl; + // check whether we are already up-to-date bool makeMode = config(L"makeMode", true); if (makeMode) @@ -274,7 +282,7 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) } if (done) { - std::cerr << "All output files up to date.\n"; + cerr << "All output files up to date.\n"; return; } } @@ -285,19 +293,12 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) ifstream fp(inputFile.c_str()); // TODO: use class File, as to support pipes if (!fp) RuntimeError("Failed to open input file: %s", inputFile.c_str()); - cerr << "Reading input file inputFile: " << inputFile << std::endl; + cerr << "Reading input file inputFile: " << inputFile << endl; if (nbrCls > 0) cls2idx.Resize(nbrCls, 1); -#if 1 - std::unordered_map v_count; -#else - // TODO: For unknown reasons, this gives a very different result (PPL of 500 instead of 190). Should be tracked down. - std::map v_count; - v_count[beginSequence] = 0; // get these into the table upfront into position 0 (and 1 if different) - v_count[endSequence] = 0; -#endif + unordered_map v_count; // process input line by line string str; @@ -323,18 +324,14 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) } fp.close(); - std::cerr << "Vocabulary size " << v_count.size() << ".\n"; - - std::vector m_words; - std::set m_remained_words; - std::unordered_map m_index; + cerr << "Vocabulary size " << v_count.size() << ".\n"; - std::vector m_count; - std::vector m_class; // class index of each word + vector m_words; + set m_remained_words; + unordered_map m_index; - typedef std::pair stringdouble; - std::priority_queue, compare_second> - q(compare_second(), std::vector(v_count.begin(), v_count.end())); + vector m_count; + vector m_class; // class index of each word size_t wordCountLessCutoff = v_count.size(); if (cutoff > 0) @@ -348,24 +345,30 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) if (vocabSize > wordCountLessCutoff) { - std::cerr << "Warning: actual vocabulary size is less than required." << endl; - std::cerr << "\t\tRequired vocabulary size:" << vocabSize << endl; - std::cerr << "\t\tActual vocabulary size:" << v_count.size() << endl; - std::cerr << "\t\tActual vocabulary size after cutoff:" << wordCountLessCutoff << endl; - std::cerr << "\t\tWe will change to actual vocabulary size: " << wordCountLessCutoff << endl; + cerr << "Warning: actual vocabulary size is less than required." 
<< endl; + cerr << "\t\tRequired vocabulary size:" << vocabSize << endl; + cerr << "\t\tActual vocabulary size:" << v_count.size() << endl; + cerr << "\t\tActual vocabulary size after cutoff:" << wordCountLessCutoff << endl; + cerr << "\t\tWe will change to actual vocabulary size: " << wordCountLessCutoff << endl; vocabSize = wordCountLessCutoff; } + + // form classes + // Implements an algorithm by Mikolov --TODO: get the reference wrd2cls.Resize(vocabSize, 1); - std::unordered_map removed; - double unkCount = 0; + typedef pair stringdouble; + unordered_map removed; // note: std::map is supposedly faster + double unkCount = 0; // TODO: why double? size_t size = 0; size_t actual_vocab_size = vocabSize - 1; - while (size < actual_vocab_size && !q.empty()) + priority_queue, compare_second> + q(compare_second(), vector(v_count.begin(), v_count.end())); + while (size < actual_vocab_size && !q.empty()) // ==for (q=...; cond; q.pop()) { size++; - std::string word = q.top().first; - double freq = q.top().second; + string word = q.top().first; + double freq = q.top().second; // TODO: why double? if (word == unkWord) { unkCount += freq; @@ -380,8 +383,6 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) q.pop(); } removed[unkWord] = unkCount; - std::priority_queue, compare_second> - p(compare_second(), std::vector(removed.begin(), removed.end())); m_count.resize(removed.size()); double total = 0; double dd = 0; @@ -396,11 +397,13 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) double df = 0; size_t class_id = 0; - m_class.resize(p.size()); + m_class.resize(removed.size()); + priority_queue, compare_second> + p(compare_second(), vector(removed.begin(), removed.end())); while (!p.empty()) { - std::string word = p.top().first; + string word = p.top().first; double freq = p.top().second; if (nbrCls > 0) { @@ -423,9 +426,19 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) p.pop(); } - std::ofstream ofvocab; + // write the files + if (!outputMappingFile.empty()) + { + msra::files::make_intermediate_dirs(s2ws(outputMappingFile)); + ofstream ofmapping(outputMappingFile.c_str()); + for (size_t i = 0; i < m_index.size(); i++) + ofmapping << m_words[i] << endl; + ofmapping.close(); + cerr << "Created label-mapping file with " << v_count.size() << " entries.\n"; + } + msra::files::make_intermediate_dirs(s2ws(outputVocabFile)); - ofvocab.open(outputVocabFile.c_str()); + ofstream ofvocab(outputVocabFile.c_str()); for (size_t i = 0; i < m_index.size(); i++) { if (nbrCls > 0) @@ -436,10 +449,10 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) cls2idx(clsIdx, 0) = (ElemType) i; // the left boundary of clsIdx prevClsIdx = m_class[i]; } - ofvocab << " " << i << "\t " << m_count[i] << "\t" << m_words[i] << "\t" << clsIdx << std::endl; + ofvocab << " " << i << "\t " << m_count[i] << "\t" << m_words[i] << "\t" << clsIdx << endl; } ofvocab.close(); - std::cerr << "Created vocabulary file with " << v_count.size() << " entries.\n"; + cerr << "Created vocabulary file with " << v_count.size() << " entries.\n"; if (nbrCls > 0) { @@ -452,7 +465,7 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config) for (size_t r = 0; r < wrd2cls.GetNumRows(); r++) owfp << (int) wrd2cls(r, 0) << endl; owfp.close(); - std::cerr << "Created word-to-class map with " << wrd2cls.GetNumRows() << " entries.\n"; + cerr << "Created word-to-class map with " << wrd2cls.GetNumRows() << " entries.\n"; msra::files::make_intermediate_dirs(s2ws(outputCls2Index)); ofstream 
 
     if (nbrCls > 0)
     {
@@ -452,7 +465,7 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
         for (size_t r = 0; r < wrd2cls.GetNumRows(); r++)
             owfp << (int) wrd2cls(r, 0) << endl;
         owfp.close();
-        std::cerr << "Created word-to-class map with " << wrd2cls.GetNumRows() << " entries.\n";
+        cerr << "Created word-to-class map with " << wrd2cls.GetNumRows() << " entries.\n";
 
         msra::files::make_intermediate_dirs(s2ws(outputCls2Index));
         ofstream ocfp(outputCls2Index.c_str());
@@ -461,7 +474,7 @@ void DoWriteWordAndClassInfo(const ConfigParameters& config)
         for (size_t r = 0; r < cls2idx.GetNumRows(); r++)
             ocfp << (int) cls2idx(r, 0) << endl;
         ocfp.close();
-        std::cerr << "Created class-to-index map with " << cls2idx.GetNumRows() << " entries.\n";
+        cerr << "Created class-to-index map with " << cls2idx.GetNumRows() << " entries.\n";
     }
 }
 
diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 01d398e762b8..1a3fcf792368 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -298,11 +298,13 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, size_t onl
                                                              const string& sequenceSeparator, const string& sequencePrologue, const string& sequenceEpilogue, const string& elementSeparator, const string& sampleSeparator,
                                                              const string& valueFormatString) const
 {
-    // get it (into a flat CPU-side vector)
+    // get minibatch matrix -> matData, matRows, matStride
     const Matrix<ElemType>& outputValues = Value();
-    size_t tempArraySize = 0;
-    ElemType* tempArray = nullptr;
-    outputValues.CopyToArray(tempArray, tempArraySize);
+    let matRows = outputValues.GetNumRows();
+    let matStride = matRows; // how to get from one column to the next
+    ElemType* matData = nullptr;
+    size_t matDataSize = 0;
+    outputValues.CopyToArray(matData, matDataSize);
 
     // process all sequences one by one
     auto pMBLayout = GetMBLayout();
@@ -312,19 +314,21 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, size_t onl
         pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
         // TODO: This can be done more efficiently, if ever needed.
     }
-    const auto& sequences = pMBLayout->GetAllSequences();
-    size_t colStride = pMBLayout->GetNumParallelSequences() * outputValues.GetNumRows(); // how to get from one column to the next
-    size_t width = pMBLayout->GetNumTimeSteps();
+    let& sequences = pMBLayout->GetAllSequences();
+    let width = pMBLayout->GetNumTimeSteps();
     for (size_t s = 0; s < sequences.size(); s++)
     {
         const auto& seqInfo = sequences[s];
         if (seqInfo.seqId == GAP_SEQUENCE_ID) // nothing in gaps to print
            continue;
-        size_t tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
-        size_t tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width;
+        let tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
+        let tEnd = seqInfo.tEnd <= width ?
seqInfo.tEnd : width; - // current sequence is a matrix with 'colStride' beginning at the following pointer - ElemType* pCurValue = tempArray + s * outputValues.GetNumRows() + seqInfo.tBegin; + // get sequence matrix -> seqData, seqRows, seqCols, seqStride + let seqData = matData + pMBLayout->GetColumnIndex(seqInfo, 0) * matStride; + auto seqRows = matRows; + let seqCols = tEnd - tBegin; + let seqStride = pMBLayout->GetNumParallelSequences() * matStride; if (s > 0) fprintfOrDie(f, "%s", sequenceSeparator.c_str()); @@ -332,40 +336,39 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl // output it according to our format specification let formatChar = valueFormatString.back(); - size_t dim = outputValues.GetNumRows(); - size_t T = tEnd - tBegin; - if (isCategoryLabel) + if (isCategoryLabel) // if is category then find the max value and output its index (possibly mapped to a string) { if (formatChar == 's') // verify label dimension { if (outputValues.GetNumRows() != labelMapping.size()) - InvalidArgument("write: Row dimension %d does not match number of entries %d in labelMappingFile", (int)dim, (int)labelMapping.size()); + InvalidArgument("write: Row dimension %d does not match number of entries %d in labelMappingFile", (int)seqRows, (int)labelMapping.size()); } // update the matrix in-place from one-hot (or max) to index // find the max in each column - for (size_t j = 0; j < T; j++) + for (size_t j = 0; j < seqCols; j++) // loop over all time steps of the sequence { - double maxPos = -1; + double maxLoc = -1; double maxVal = 0; - for (size_t i = 0; i < dim; i++) + for (size_t i = 0; i < seqRows; i++) // loop over rows { - double val = pCurValue[i + j * dim * colStride]; - if (maxPos < 0 || val >= maxVal) + let val = seqData[i + j * seqStride]; + if (maxLoc < 0 || val >= maxVal) { - maxPos = (double)i; + maxLoc = (double)i; maxVal = val; } } - pCurValue[0 + j * colStride] = (ElemType)maxPos; // overwrite first element in-place + seqData[0 + j * seqStride] = (ElemType)maxLoc; // overwrite first element in-place } - dim = 1; // ignore remaining dimensions + seqRows = 1; // ignore remaining dimensions } - let iend = transpose ? dim : T; // true dimension of the data to print - let jend = transpose ? T : dim; + // bounds for printing + let iend = transpose ? seqRows : seqCols; // true dimension of the data to print + let jend = transpose ? seqCols : seqRows; let istop = transpose ? onlyUpToRow : onlyUpToT; // we stop at these dimensions (for debugging, one often needs only the first few values of those huge matrices) let jstop = transpose ? onlyUpToT : onlyUpToRow; - let istride = transpose ? 1 : colStride; - let jstride = transpose ? colStride : 1; + let istride = transpose ? 1 : seqStride; + let jstride = transpose ? 
seqStride : 1; for (size_t j = 0; j < jend; j++) { if (j > 0) @@ -384,19 +387,18 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl fprintf(f, "...+%d", (int)(iend - istop)); break; } - else if (formatChar == 'f') // print as real number + double dval = seqData[i * istride + j * jstride]; + if (formatChar == 'f') // print as real number { - double dval = pCurValue[i * istride + j * jstride]; fprintfOrDie(f, valueFormatString.c_str(), dval); } else if (formatChar == 'u') // print category as integer index { - unsigned int uval = (unsigned int)pCurValue[i * istride + j * jstride]; - fprintfOrDie(f, valueFormatString.c_str(), uval); + fprintfOrDie(f, valueFormatString.c_str(), (unsigned int)dval); } else if (formatChar == 's') // print category as a label string { - size_t uval = (size_t)pCurValue[i * istride + j * jstride]; + size_t uval = (size_t)dval; assert(uval < labelMapping.size()); const char * sval = labelMapping[uval].c_str(); fprintfOrDie(f, valueFormatString.c_str(), sval); @@ -406,7 +408,7 @@ void ComputationNode::WriteMinibatchWithFormatting(FILE* f, size_t onl fprintfOrDie(f, "%s", sequenceEpilogue.c_str()); } // end loop over sequences - delete[] tempArray; + delete[] matData; } // ----------------------------------------------------------------------- diff --git a/Source/Readers/LMSequenceReader/SequenceReader.cpp b/Source/Readers/LMSequenceReader/SequenceReader.cpp index 838f56402fc8..b3759342e7dc 100644 --- a/Source/Readers/LMSequenceReader/SequenceReader.cpp +++ b/Source/Readers/LMSequenceReader/SequenceReader.cpp @@ -1506,6 +1506,7 @@ void BatchSequenceReader::InitFromConfig(const ConfigRecordType& reade } else { + fprintf(stderr, "LMSequenceReader: Label mapping will be created internally on the fly because the labelMappingFile was not found: %ls\n", labelPath.c_str()); if (wClassFile != L"") { #if 0 @@ -1538,6 +1539,7 @@ void BatchSequenceReader::InitFromConfig(const ConfigRecordType& reade } labelInfo.mapName = labelPath; labelInfo.fileToWrite = labelPath; // mapping path denotes an output: write the mapping here at the end + // BUGBUG: This facility is not functional. No file is being created. 
    }
}
 
From fcbc749a00d478a68445d7c2113469984c48645a Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Mon, 14 Mar 2016 21:05:04 -0700
Subject: [PATCH 17/26] (incorporated minor code-review feedback)

---
 Source/ComputationNetworkLib/ReshapingNodes.cpp | 12 ++++++------
 Source/ComputationNetworkLib/ReshapingNodes.h   |  2 +-
 Source/SGDLib/SGD.cpp                           |  4 ++--
 Source/SGDLib/SGD.h                             |  3 ++-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp
index 9f07164d63b2..d2860fda66c6 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.cpp
+++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp
@@ -32,17 +32,17 @@ struct SequenceLengthVector
 {
     typedef vector<vector<size_t>> SequenceVector;
     typedef MBLayout::SequenceInfo SequenceInfo;
-    const SequenceVector& sequenceVector;     //
-    const vector<SequenceInfo>& sequenceInfo; // original sequence info (for seqId)
-    SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : sequenceInfo(sequenceInfo), sequenceVector(sequenceVector) { }
-    size_t size() const { return sequenceInfo.size(); }
+    const SequenceVector& m_sequenceVector;     // vector of sequences (to get sequence length)
+    const vector<SequenceInfo>& m_sequenceInfo; // original sequence info (for seqId)
+    SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) { }
+    size_t size() const { return m_sequenceInfo.size(); }
     MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence
     {
         SequenceInfo seq;
-        seq.seqId = sequenceInfo[i].seqId;
+        seq.seqId = m_sequenceInfo[i].seqId;
         seq.s = i;
         seq.tBegin = 0;
-        seq.tEnd = sequenceVector[i].size();
+        seq.tEnd = m_sequenceVector[i].size();
         return seq;
     }
     void operator=(const SequenceLengthVector&) = delete;
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index dbade27997b3..7c186e9b421a 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -560,7 +560,7 @@ template class RowRepeatNode;
 // -----------------------------------------------------------------------
 // WhereNode(cond) -- extract indices of non-0 values in a sequence
-// As this implies a runtime-vale dependent reduction in dimension, it can
+// As this implies a runtime-value dependent reduction in dimension, it can
 // only be applied to time sequences, and not other tensor dimensions.
 // The result will have a different MBLayout reflecting the shortened result sequences.
 // -----------------------------------------------------------------------
diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp
index 5b086aa709a7..17046306c497 100644
--- a/Source/SGDLib/SGD.cpp
+++ b/Source/SGDLib/SGD.cpp
@@ -73,9 +73,9 @@ void SGD<ElemType>::Train(function createN
     // set tracing flags
     for (const auto& traceNodeName : m_traceNodeNamesReal)
-        net->GetNodeFromName(traceNodeName)->EnableNodeTracing(false);
+        net->GetNodeFromName(traceNodeName)->EnableNodeTracing(/*isCategoryLabel=*/false);
     for (const auto& traceNodeName : m_traceNodeNamesCategory)
-        net->GetNodeFromName(traceNodeName)->EnableNodeTracing(true);
+        net->GetNodeFromName(traceNodeName)->EnableNodeTracing(/*isCategoryLabel=*/true);
 
     TrainOrAdaptModel(startEpoch, net, loadNetworkFromCheckpoint, net, nullptr, trainSetDataReader, validationSetDataReader);
 }
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index 70f0889a0114..2278f61da7a4 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -507,7 +507,8 @@ class SGD : public SGDParams
     wstring m_evalCriterionNodeName;
 
     // enable tracing. Nodes listed here get their m_traceNodeValue and m_traceNodeValueAsCategoryLabel flags set
-    vector<wstring> m_traceNodeNamesReal, m_traceNodeNamesCategory;
+    vector<wstring> m_traceNodeNamesReal;
+    vector<wstring> m_traceNodeNamesCategory;
 
     size_t m_prevChosenMinibatchSize;
     double m_lastFinishedEpochTrainLoss;

From 62ca8680c5ab054b1d56ce8b86362abd151543bf Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Tue, 15 Mar 2016 08:08:20 -0700
Subject: [PATCH 18/26] bug fix: Backprop() must be prepared to run on a node
 whose gradient has not yet been allocated; bug fix: HardmaxNode must not
 consider calling BackpropTo() on it an error; bug fix: DelayNodeBase must
 never deserialize the matrix row dimension

---
 .../PennTreebank/Config/S2SAutoEncoder.cntk     | 524 ++++++++----------
 .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs   |  13 +-
 Source/Common/Include/TensorShape.h             |   3 +-
 .../ComputationNetworkAnalysis.cpp              |   2 +-
 .../ComputationNetworkLib/ComputationNode.cpp   |  65 ++-
 .../ComputationNetworkLib/ComputationNode.h     |  42 +-
 .../ComputationNetworkLib/NonlinearityNodes.h   |  23 +-
 Source/ComputationNetworkLib/RecurrentNodes.h   |  30 +-
 Source/ComputationNetworkLib/ReshapingNodes.h   |   2 +-
 9 files changed, 329 insertions(+), 375 deletions(-)

diff --git a/Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk b/Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk
index c8d0e722da1b..5bf886bfccdd 100644
--- a/Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk
+++ b/Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk
@@ -5,9 +5,9 @@
 ####################
 
 # Command line to run in debugger:
-# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunDir=$(SolutionDir)Examples/Text/PennTreebank/_run RootDir=$(SolutionDir)Examples/Text/PennTreebank/_run DataDir=$(SolutionDir)Examples/Text/PennTreebank/Data ConfigDir=$(SolutionDir)Examples/Text/PennTreebank/Config stderr=$(SolutionDir)Examples/Text/PennTreebank/_run/Simple.log train=[SGD=[maxEpochs=1]] train=[epochSize=2048]] confVocabSize=1000 DeviceId=0 makeMode=false
+# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunDir=$(SolutionDir)Examples/Text/PennTreebank/_run RootDir=$(SolutionDir)Examples/Text/PennTreebank/_run DataDir=$(SolutionDir)Examples/Text/PennTreebank/Data ConfigDir=$(SolutionDir)Examples/Text/PennTreebank/Config stderr=$(SolutionDir)Examples/Text/PennTreebank/_run/S2SAutoEncoder.log train=[SGD=[maxEpochs=1]] confVocabSize=1000 DeviceId=-1
makeMode=false # Append this for small set: -# trainFile=ptb.small.train.txt validFile=ptb.small.valid.txt testFile=ptb.small.test.txt +# train=[epochSize=2048]] trainFile=ptb.small.train.txt validFile=ptb.small.valid.txt testFile=ptb.small.test.txt # It implements a sequence-to-sequence based auto-encoder. # It encodes an entire sentence into a flat vector, and tries to regenerate it. @@ -37,14 +37,16 @@ modelPath = "$ModelDir$/S2SAutoEncoder.dnn" # uncomment the following line to write logs to a file #stderr=$OutputDir$/rnnOutput -numCPUThreads = 1 +#numCPUThreads = 1 confVocabSize = 10000 confClassSize = 50 useStabilizer = true -trainFile = "ptb.train.txt" +#trainFile = "ptb.train.txt" +trainFile = "ptb.small.train.txt" validFile = "ptb.valid.txt" +#validFile = "ptb.small.valid.txt" testFile = "ptb.test.txt" ####################################### @@ -53,11 +55,20 @@ testFile = "ptb.test.txt" BrainScriptNetworkBuilder = [ + # import general config options from outside config values + vocabDim = $confVocabSize$ + nbrClass = $confClassSize$ + + useStabilizer = $useStabilizer$ + useEncoder = true // if false, this becomes a regular RNN + # import some namespaces - RecurrentLSTMP = BS.RNNs.RecurrentLSTMP Parameters = BS.Parameters - Loop = BS.Loop - Boolean = BS.Boolean + Constants = BS.Constants + Sequences = BS.Sequences + Loop = BS.Loop + Boolean = BS.Boolean + RecurrentLSTMP = BS.RNNs.RecurrentLSTMP # define an LSTM with a per-sequence initialization value # TODO: Not currently used. Move to BS library once tested. @@ -82,12 +93,6 @@ BrainScriptNetworkBuilder = [ ] ].lstmState.h // that's the value we return - # import general config options from outside config values - vocabDim = $confVocabSize$ - nbrClass = $confClassSize$ - - useStabilizer = $useStabilizer$ - embeddingDim = 300 hiddenDim = 200 @@ -102,7 +107,10 @@ BrainScriptNetworkBuilder = [ # embedding E = Parameters.WeightParam (vocabDim, embeddingDim) # note: this is assumed to be applied transposed, hence the swapped dimensions - Embed (x) = TransposeTimes (E, Parameters.Stabilize (x, enabled=useStabilizer)) # embeddings are linear, so better stabilize. We really should use BatchNorm. + Embed (x) = TransposeTimes (E, Parameters.Stabilize (x, enabled=useStabilizer)) # embeddings are linear, so better stabilize. We really should use BatchNorm. + #E = Parameters.WeightParam (embeddingDim, vocabDim) # note: this is assumed to be applied transposed, hence the swapped dimensions + #Embed (x) = new ComputationNode [ operation = 'LookupTable' ; inputs = (E : Parameters.Stabilize (x, enabled=useStabilizer)) ; tag = '' ] + inputEmbedded = Embed (input) labelsEmbedded = Embed (labels) @@ -124,16 +132,18 @@ BrainScriptNetworkBuilder = [ /*then*/ x, // then copy that /*else*/ FutureValue (0, result)) // else just propagate to the front ].result - thoughtVectorDim = decoderDims[decoderOutputLayer] + thoughtVectorDim = encoderDims[encoderOutputLayer] # decoder # The decoder starts with hidden state 0 # and takes as input (thoughtVector; previous word). 
+    decoderInputDim = if useEncoder then thoughtVectorDim + embeddingDim else embeddingDim
+    decoderInput    = if useEncoder then RowStack (thoughtVector : Loop.Previous (decoderFeedback)) else Loop.Previous (decoderFeedback)
     decoderOutputLayer = Length (decoderDims)-1
     decoder[i:0..decoderOutputLayer] =
         if i == 0
-        then RecurrentLSTMP (thoughtVectorDim + embeddingDim, decoderDims[i], decoderDims[i],
-                             RowStack (thoughtVector : Loop.Previous (labelsEmbedded)),
+        then RecurrentLSTMP (decoderInputDim, decoderDims[i], decoderDims[i],
+                             decoderInput,
                              enableSelfStabilization=useStabilizer)
         else RecurrentLSTMP (decoderDims[i-1], decoderDims[i], decoderDims[i],
                              decoder[i-1],
@@ -145,12 +155,34 @@ BrainScriptNetworkBuilder = [
     W(x) = Parameters.WeightParam (vocabDim, decoderDim) * Parameters.Stabilize (x, enabled=useStabilizer)
     B = Parameters.BiasParam (vocabDim)
 
-    z = W(decoderOutput) + B; // top-level input to Softmax
+    z = W(decoderOutput) + B;  // top-level input to Softmax
+
+    decoderOutputEmbedded = Embed (Hardmax (z))
+
+    # decoder feedback differs between training and test
+    isTraining = EnvironmentInput('isTraining', tag='eval')
+    #decoderFeedback = labelsEmbedded
+    # BUGBUG: This does not work:
+    decoderFeedback = Boolean.If (isTraining, labelsEmbedded, decoderOutputEmbedded)
+    # 'decoderFeedback' gets topo-sorted to the end of the loop, which is the wrong entry point, it must be way down; 'z' is the right entry point from the top
+
+    # exclude the first token, which is sentence start. Don't want to train on that.
+    CastAs (type, data) = Sequences.Scatter (Constants.OnesLike (type), data)
+
+    SkipFirst (x) = Sequences.Skip (1, x)
+    z1      = SkipFirst (z)
+    labels1 = CastAs (z1, SkipFirst (labels))
 
     # training criteria
-    # The target is the full sequence including <s> and </s>.
-    ce = CrossEntropyWithSoftmax(labels, z, tag='criterion')  // this is the training objective
-    wer = ErrorPrediction(labels, z, tag='eval')              // this also gets tracked
+    ce = CrossEntropyWithSoftmax(labels1, z1, tag='criterion')  // this is the training objective
+    wer = ErrorPrediction (labels1, z1, tag='eval')             // this also gets tracked
+
+    #indexTestVals = Plus (decoderOutput, BS.Constants.Zero, tag='eval')
+    #indexTest = RowSlice (0, 1, indexTestVals)
+    #index = Where (RectifiedLinear (indexTest), tag='eval'); // for testing: this thresholds all negative numbers to 0=false, keeping positive as !=0=true
+    #packedIndex = PackedIndex (indexTest, index, tag='eval')
+    #filtered = GatherPacked (packedIndex, indexTestVals, tag='eval')
+    #unfiltered = ScatterPacked (indexTest, packedIndex, filtered, tag='eval')
 ]
@@ -158,8 +190,10 @@ BrainScriptNetworkBuilder = [
 #######################################
 
 reader = [
-    readerType = LMSequenceReader
+    file = "$DataDir$/$trainFile$"
     #randomize = "auto"  # gets ignored
+
+    readerType = LMSequenceReader
     mode = "softmax"
     nbruttsineachrecurrentiter = 0  # 0 means auto-fill given minibatch size
     cacheBlockSize = 100000000      # read block size. This value is large enough to load entire corpus at once
 
     # word class info
     wordclass = "$ModelDir$/vocab.txt"
 
     #### write definition
     # if writerType is set, we will cache to a binary file
     # if the binary file exists, we will use it instead of parsing this file
     #writerType = BinaryReader
     wfile = $CacheDir$\sequenceSentence.bin
     # if calculated size would be bigger, that is used instead
     wsize = 256
     #wrecords - number of records we should allocate space for in the file
     # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file
     wrecords = 1000
@@ -180,7 +214,73 @@ reader = [
     #windowSize - number of records we should include in BinaryWriter window
     windowSize = 10000
 
-    file = "$DataDir$/$trainFile$"
+    # additional features sections
+    # For input labels, we need both 'features' and the first labels section (called 'inputLabelsDef' below)
+    input = [
+        dim = 0  # no (explicit) labels ...labelDim correct??
+ ### write definition + sectionType = "data" + ] + # labels sections + # TODO: seems we must specify two labels (in and out), but labelType = "none" is allowed + # labels sections --this is required, but our labels are extracted from the inLabels + inputLabelsDef = [ # BUGBUG: Make sure that this section name comes before the dummy output labels alphabetically + dim = 1 + + # vocabulary size + labelType = "category" + labelDim = "$confVocabSize$" + labelMappingFile = "$ModelDir$/vocab.wl" + beginSequence = "" + endSequence = "" + + #### Write definition #### + # sizeof(unsigned) which is the label index type + elementSize=4 + sectionType=labels + mapping = [ + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping + ] + category = [ + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels + ] + ] + outputDummy = [ + labelType = "none" + ] +] + +cvReader = [ + file = "$DataDir$/$validFile$" + #randomize = "none" # gets ignored + + # everything below here is duplicated from 'reader' + readerType = LMSequenceReader + mode = "softmax" + nbruttsineachrecurrentiter = 0 # 0 means auto-fill given minibatch size + cacheBlockSize = 100000000 # read block size. This value is large enough to load entire corpus at once + + # word class info + wordclass = "$ModelDir$/vocab.txt" + + #### write definition + # if writerType is set, we will cache to a binary file + # if the binary file exists, we will use it instead of parsing this file + #writerType = BinaryReader + wfile = $CacheDir$\sequenceSentence.bin + # if calculated size would be bigger, that is used instead + wsize = 256 + #wrecords - number of records we should allocate space for in the file + # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file + wrecords = 1000 + #windowSize - number of records we should include in BinaryWriter window + windowSize = 10000 # additional features sections # For input labels, we need both 'features' and the first labels section (called 'inputLabelsDef' below) @@ -198,8 +298,8 @@ reader = [ # vocabulary size labelType = "category" labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.txt" - beginSequence = "" + labelMappingFile = "$ModelDir$/vocab.wl" + beginSequence = "" endSequence = "" #### Write definition #### @@ -233,6 +333,7 @@ writeWordAndClassInfo = [ inputFile = "$DataDir$/$trainFile$" beginSequence = "" endSequence = "" + outputMappingFile = "$ModelDir$/vocab.wl" outputVocabFile = "$ModelDir$/vocab.txt" outputWord2Cls = "$ModelDir$/word2cls.txt" outputCls2Index = "$ModelDir$/cls2idx.txt" @@ -250,14 +351,14 @@ train = [ action = "train" traceLevel = 1 epochSize = 0 # (for quick tests, this can be overridden with something small) - useValidation = false # true # TODO: need to adapt cvReader as well #BrainScriptNetworkBuilder is defined in outer scope SGD = [ - minibatchSize = 128:256:512 # TODO: Why is this here and not inside SGD? - learningRatesPerSample = 0.1 - momentumPerMB = 0 + minibatchSize = 128:256:512 + learningRatesPerSample = 0.01 + #momentumPerMB = 0 + momentumAsTimeConstant = 2500 gradientClippingWithTruncation = true # TODO: clip and truncate? What is the difference? clippingThresholdPerSample = 15.0 maxEpochs = 16 @@ -265,6 +366,14 @@ train = [ gradUpdateType = "none" # FSAdaGrad? 
loadBestModel = true + # tracing (enable these for debugging) + #traceNodeNamesReal = labelsEmbedded:decoderInput:"decoder[0].lstmState._privateInnards.ht":z.Plus_left.Times_right.result:z:ce + #traceNodeNamesReal = labelsEmbedded:decoderInput:z:ce + #traceNodeNamesReal = thoughtVector.result:zMask:z:ce:wer:indexTestVals:index:packedIndex:filtered:unfiltered:isTraining + #traceNodeNamesCategory = input + + dropoutRate = 0.0 + # settings for Auto Adjust Learning Rate AutoAdjust = [ autoAdjustLR = "adjustAfterEpoch" @@ -277,115 +386,6 @@ train = [ numPrevLearnRates = 5 numBestSearchEpoch = 1 ] - - dropoutRate = 0.0 - ] - - # if a cvReader section is specified, SGD will use this to compute the CV criterion - # TODO: adapt this - _hidden_cvReader = [ - # reader to use - readerType = "LMSequenceReader" - randomize = "none" - nbruttsineachrecurrentiter = 0 # 0 means fill up the minibatch with as many parallel sequences as fit - cacheBlockSize = 2000000 # just load it all - - # word class info - wordclass = "$ModelDir$/vocab.txt" - - # if writerType is set, we will cache to a binary file - # if the binary file exists, we will use it instead of parsing this file - # writerType = "BinaryReader" - - # write definition - wfile = "$OutputDir$/sequenceSentence.valid.bin" - - # wsize - inital size of the file in MB - # if calculated size would be bigger, that is used instead - wsize = 256 - - # wrecords - number of records we should allocate space for in the file - # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file - wrecords = 1000 - - # windowSize - number of records we should include in BinaryWriter window - windowSize = "$confVocabSize$" - - file = "$DataDir$/$validFile$" - - # additional features sections - # for now store as expanded category data (including label in) - features = [ - # sentence has no features, so need to set dimension to zero - dim = 0 - # write definition - sectionType = "data" - ] - - # labels sections - # it should be the same as that in the training set - labelIn = [ - dim = 1 - - # vocabulary size - labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.out.txt" - - labelType = "Category" - beginSequence = "" - endSequence = "" - - # Write definition - # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - - mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 11 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" - ] - - category = [ - dim = 11 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" - ] - ] - - #labels sections - labels = [ - dim = 1 - - labelType = "NextWord" - beginSequence = "O" - endSequence = "O" - - # vocabulary size - labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.out.txt" - - # Write definition - # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - - mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 3 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" - ] - - category = [ - dim = 3 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" - ] - ] ] ] @@ -403,104 +403,70 @@ test = [ epochSize = 0 reader = [ - # reader to use - readerType = "LMSequenceReader" - randomize 
= "none" - nbruttsineachrecurrentiter = 0 # 0 means fill up the minibatch with as many parallel sequences as fit - cacheBlockSize = 2000000 # just load it all - + file = "$DataDir$/$testFile$" + #randomize = "none" # gets ignored + + # everything below here is duplicated from 'reader' + readerType = LMSequenceReader + mode = "softmax" + nbruttsineachrecurrentiter = 0 # 0 means auto-fill given minibatch size + cacheBlockSize = 100000000 # read block size. This value is large enough to load entire corpus at once + # word class info wordclass = "$ModelDir$/vocab.txt" - + + #### write definition # if writerType is set, we will cache to a binary file # if the binary file exists, we will use it instead of parsing this file - # writerType = "BinaryReader" - - # write definition - wfile = "$OutputDir$/sequenceSentence.bin" - # wsize - inital size of the file in MB + #writerType = BinaryReader + wfile = $CacheDir$\sequenceSentence.bin # if calculated size would be bigger, that is used instead wsize = 256 - - # wrecords - number of records we should allocate space for in the file + #wrecords - number of records we should allocate space for in the file # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file wrecords = 1000 - - # windowSize - number of records we should include in BinaryWriter window - windowSize = "$confVocabSize$" - - file = "$DataDir$/$testFile$" - + #windowSize - number of records we should include in BinaryWriter window + windowSize = 10000 + # additional features sections - # for now store as expanded category data (including label in) - features = [ - # sentence has no features, so need to set dimension to zero - dim = 0 - # write definition + # For input labels, we need both 'features' and the first labels section (called 'inputLabelsDef' below) + input = [ + dim = 0 # no (explicit) labels ...labelDim correct?? 
+ ### write definition sectionType = "data" ] - - #labels sections - labelIn = [ + # labels sections + # TODO: seems we must specify two labels (in and out), but labelType = "none" is allowed + # labels sections --this is required, but our labels are extracted from the inLabels + inputLabelsDef = [ # BUGBUG: Make sure that this section name comes before the dummy output labels alphabetically dim = 1 - + # vocabulary size + labelType = "category" labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.txt" - - labelType = "Category" + labelMappingFile = "$ModelDir$/vocab.wl" beginSequence = "" - endSequence = "" - - # Write definition + endSequence = "" + + #### Write definition #### # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - + elementSize=4 + sectionType=labels mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 11 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping ] - category = [ - dim = 11 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels ] ] - - #labels sections - labels = [ - dim = 1 - labelType = "NextWord" - beginSequence = "O" - endSequence = "O" - - # vocabulary size - labelDim = "$confVocabSize$" - - labelMappingFile = "$OutputDir$/sentenceLabels.out.txt" - # Write definition - # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - - mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 3 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" - ] - - category = [ - dim = 3 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" - ] + outputDummy = [ + labelType = "none" ] ] ] @@ -527,10 +493,12 @@ write = [ outputPath = "$OutputDir$/Write" #outputPath = "-" # "-" will write to stdout; useful for debugging - outputNodeNames = TrainNodeClassBasedCrossEntropy # when processing one sentence per minibatch, this is the sentence posterior + outputNodeNames = z # when processing one sentence per minibatch, this is the sentence posterior format = [ - sequencePrologue = "log P(W)=" # (using this to demonstrate some formatting strings) - type = "real" + type = "category" + transpose = false + labelMappingFile = "$ModelDir$/vocab.wl" + #sequencePrologue = "log P(W)=" # (using this to demonstrate some formatting strings) ] minibatchSize = 8192 # choose this to be big enough for the longest sentence @@ -539,104 +507,70 @@ write = [ epochSize = 0 reader = [ - # reader to use - readerType = "LMSequenceReader" - randomize = "none" # BUGBUG: This is ignored. - nbruttsineachrecurrentiter = 1 # one sentence per minibatch - cacheBlockSize = 1 # workaround to disable randomization - + file = "$DataDir$/$testFile$" + #randomize = "none" # gets ignored + + # everything below here is duplicated from 'reader' + readerType = LMSequenceReader + mode = "softmax" + nbruttsineachrecurrentiter = 1 # 0 means auto-fill given minibatch size + cacheBlockSize = 1 #00000000 # read block size. 
This value is large enough to load entire corpus at once + # word class info wordclass = "$ModelDir$/vocab.txt" - + + #### write definition # if writerType is set, we will cache to a binary file # if the binary file exists, we will use it instead of parsing this file - # writerType = "BinaryReader" - - # write definition - wfile = "$OutputDir$/sequenceSentence.bin" - # wsize - inital size of the file in MB + #writerType = BinaryReader + wfile = $CacheDir$\sequenceSentence.bin # if calculated size would be bigger, that is used instead wsize = 256 - - # wrecords - number of records we should allocate space for in the file + #wrecords - number of records we should allocate space for in the file # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file wrecords = 1000 - - # windowSize - number of records we should include in BinaryWriter window - windowSize = "$confVocabSize$" - - file = "$DataDir$/$testFile$" - + #windowSize - number of records we should include in BinaryWriter window + windowSize = 10000 + # additional features sections - # for now store as expanded category data (including label in) - features = [ - # sentence has no features, so need to set dimension to zero - dim = 0 - # write definition + # For input labels, we need both 'features' and the first labels section (called 'inputLabelsDef' below) + input = [ + dim = 0 # no (explicit) labels ...labelDim correct?? + ### write definition sectionType = "data" ] - - #labels sections - labelIn = [ + # labels sections + # TODO: seems we must specify two labels (in and out), but labelType = "none" is allowed + # labels sections --this is required, but our labels are extracted from the inLabels + inputLabelsDef = [ # BUGBUG: Make sure that this section name comes before the dummy output labels alphabetically dim = 1 - + # vocabulary size + labelType = "category" labelDim = "$confVocabSize$" - labelMappingFile = "$OutputDir$/sentenceLabels.txt" - - labelType = "Category" + labelMappingFile = "$ModelDir$/vocab.wl" beginSequence = "" - endSequence = "" - - # Write definition + endSequence = "" + + #### Write definition #### # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - + elementSize=4 + sectionType=labels mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 11 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" + #redefine number of records for this section, since we don't need to save it for each data record + wrecords=11 + #variable size so use an average string size + elementSize=10 + sectionType=labelMapping ] - category = [ - dim = 11 - # elementSize = sizeof(ElemType) is default - sectionType = "categoryLabels" + dim=11 + #elementSize=sizeof(ElemType) is default + sectionType=categoryLabels ] ] - - #labels sections - labels = [ - dim = 1 - labelType = "NextWord" - beginSequence = "O" - endSequence = "O" - - # vocabulary size - labelDim = "$confVocabSize$" - - labelMappingFile = "$OutputDir$/sentenceLabels.out.txt" - # Write definition - # sizeof(unsigned) which is the label index type - elementSize = 4 - sectionType = "labels" - - mapping = [ - # redefine number of records for this section, since we don't need to save it for each data record - wrecords = 3 - # variable size so use an average string size - elementSize = 10 - sectionType = "labelMapping" - ] - - category = [ - dim = 3 - # elementSize = 
sizeof(ElemType) is default - sectionType = "categoryLabels" - ] + outputDummy = [ + labelType = "none" ] ] ] diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs index 94108f9d3853..7eb279bf4e88 100644 --- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs +++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs @@ -191,8 +191,8 @@ Sequences = [ # Gather and Scatter # We go through 3 nodes each to take advantage of x - Gather (cond, x) = GatherPacked ( PackedIndex (x, Where (cond)), x) - Scatter (cond, y) = ScatterPacked (cond, PackedIndex (y, Where (cond)), y) + Gather (cond, x) = GatherPacked ( PackedIndex (x, Where (cond)), x) # 'cond' matches 'x' + Scatter (cond, y) = ScatterPacked (cond, PackedIndex (y, Where (cond)), y) # 'cond' matches the result # sequence-altering LINQ-like operators # These generate new data packing (MBLayouts) @@ -214,10 +214,10 @@ Sequences = [ selected = Loop._IsWithin (DelayFn, N, x) out = Gather (selected, x) ].out - Skip (N, x) = _Skip (PastValue, N, x) + Skip (N, x) = if N > 0 then _Skip (PastValue, N, x) else x _Skip (DelayFn, N, x) = [ // TODO: merge with _Take selected = Loop._IsWithin (DelayFn, N, x) - out = Gather (!selected, x) + out = Gather (Boolean.Not (selected), x) ].out ElementAt (n, x) = [ // not efficient, as it filters twice. Better AND the predicates. TODO: what if n is out of range? ElementAtOrDefault startMask = Skip (n, x) // ...000111... @@ -228,10 +228,9 @@ Sequences = [ #FirstOrDefault (x) = ? // can empty sequences exist or even be represented by CNTK? - #Last (x) = _Take (FutureValue, 1, x) - Average (x) = Sum (x) / Loop.Count(x) // TODO: patch opQuotient to check 0/0 = 0 - Sum (x) = FoldL (Plus, 0, x) + Sum (x) = FoldL (Plus, 0, x) + LogSum (x) = FoldL (LogPlus, 0, x) #Max (x) = FoldL (^.Max, ?, x) // TODO: name clash; need to implement ^. #Min (x) = FoldL (^.Min, ?, x) // TODO: what's the init value? All (x) = FoldL (Boolean.And, OnesLike (x), x) diff --git a/Source/Common/Include/TensorShape.h b/Source/Common/Include/TensorShape.h index cf5ef0dd3d0b..79495e2e462f 100644 --- a/Source/Common/Include/TensorShape.h +++ b/Source/Common/Include/TensorShape.h @@ -785,4 +785,5 @@ struct ImageDimensions return AsTensorShape(m_width, m_height, m_numChannels, imageLayoutKind); } }; -} } } + +}}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkAnalysis.cpp b/Source/ComputationNetworkLib/ComputationNetworkAnalysis.cpp index 9cc89cffa333..4db811c7ce57 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkAnalysis.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkAnalysis.cpp @@ -91,7 +91,7 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo const auto& node = iter->m_nestedNodes[j]; for (size_t i = 0; i < node->GetNumInputs(); i++) { - if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0) + if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/) { // assert(node->Input(i)->m_indexInLoop == 0); // No. It seems this variable really counts the number of parents. node->Input(i)->m_indexInLoop++; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1? 
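The FoldL-based aggregations added to CNTK.core.bs above (Sum, LogSum) are plain left folds over a sequence. A C++ analogue of the same definitions, assuming LogPlus(a, b) = log(exp(a) + exp(b)) computed stably; a sketch, not part of the patch:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // left fold, as in FoldL (f, init, x)
    template <class F>
    double FoldL(F f, double init, const std::vector<double>& x)
    {
        double acc = init;
        for (double v : x)
            acc = f(acc, v);
        return acc;
    }

    double Plus(double a, double b) { return a + b; }

    double LogPlus(double a, double b) // log(exp(a) + exp(b)), shifted by the max for stability
    {
        double m = std::max(a, b);
        return m + std::log(std::exp(a - m) + std::exp(b - m));
    }

    // Sum (x)    = FoldL (Plus, 0, x)
    // LogSum (x) = FoldL (LogPlus, 0, x)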
diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 1a3fcf792368..e0f63046baa2 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -18,6 +18,59 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
 using namespace std;
 
+// -----------------------------------------------------------------------
+// subroutines for evaluation
+// -----------------------------------------------------------------------
+
+template <class ElemType>
+void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) /*override*/
+{
+    // Normally our gradient matrix was created as an input of another node.
+    // This does not happen though in the special case of a node inside a loop
+    // that no consumer outside depends on. Those might get topologically sorted
+    // after nodes that propagate outside of the loop, and thus, in the last
+    // time step of the sequence, have not yet received a gradient from a parent
+    // and thus may not have had their gradient matrices allocated.
+    if (m_needsGradient)
+        LazyZeroGradient(); // set gradient to 0 if this is the first time
+
+    if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
+        LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());
+
+    for (size_t i = 0; i < m_inputs.size(); i++)
+    {
+        ComputationNodePtr child = Input(i);
+        if (child->m_needsGradient &&
+            ((childrenInThisLoop  && child->IsPartOfLoop() == IsPartOfLoop()) ||
+             (childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()) ))
+        {
+            // fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
+            if (!m_needsGradient)
+                LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
+#if DUMPOUTPUT
+            fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
+#endif
+            child->LazyZeroGradient(); // set gradient to 0 if this is the first time
+
+            // If we propagate from a loop to a node that is outside the loop, we are not efficient.
+            // This case is handled by SEQTraversalFlowControlNode::Backprop().
+            // The check below is to verify that.
+            if (IsPartOfLoop() && !child->IsPartOfLoop() && !fr.IsAllFrames())
+            {
+                LogicError("Backprop: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
+                           NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
+            }
+
+            // fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
+            BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
+        }
+#ifdef DISPLAY_DEBUG
+        else
+            fprintf(stderr, "    [%lu]: %s(%s) (no gradient needed so don't compute for)\n", i, child->OperationName().c_str(), child->NodeName().c_str());
+#endif
+    }
+}
+
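The comment at the top of Backprop() describes a lazy-allocation contract: a gradient matrix is created and zeroed only on first use, so a node that never receives a parent gradient still ends up with a valid all-zero gradient. The pattern in isolation, with stand-in types rather than the actual CNTK classes:

    #include <cstddef>
    #include <vector>

    struct Buffer
    {
        size_t rows = 0, cols = 0;
        std::vector<float> data;
        void ResizeAndZero(size_t r, size_t c) { rows = r; cols = c; data.assign(r * c, 0.0f); }
    };

    struct NodeSketch
    {
        bool m_gradientInitialized = false;
        Buffer m_value, m_gradient;

        // allocate and zero the gradient only when first needed; safe to call repeatedly
        void LazyZeroGradient()
        {
            if (m_gradientInitialized)
                return;
            m_gradient.ResizeAndZero(m_value.rows, m_value.cols); // gradient matches the value's dimensions
            m_gradientInitialized = true;
        }
    };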
 // -----------------------------------------------------------------------
 // subroutines for Validate() implementations
 // -----------------------------------------------------------------------
@@ -116,10 +169,14 @@ void ComputationNodeBase::ValidateBinaryReduce(bool isFinalValidationPass)
     ComputationNodeBase::Validate(isFinalValidationPass);
     m_pMBLayout = nullptr; // this node does not hold mini-batch data
     ValidateInferBinaryInputDims();
-    if (isFinalValidationPass &&
-        !(Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) && // TODO: Do we need broadcasting for these cases?
-          (Input(0)->GetMBLayout() == Input(1)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(1)->HasMBLayout())))
-        LogicError("The Matrix dimensions or MB layout in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
+    if (isFinalValidationPass)
+    {
+        // inputs must have identical layouts and must be minibatch data
+        if (!(Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout())))
+            LogicError("The Matrix dimensions in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
+        if (Input(0)->GetMBLayout() != Input(1)->GetMBLayout() || !Input(0)->HasMBLayout() || !Input(1)->HasMBLayout())
+            LogicError("The MB layouts in the %ls %ls operation do not match.", NodeName().c_str(), OperationName().c_str());
+    }
     SetDims(TensorShape(1), false);
 }
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 976c83cd72db..ba0365d05dfd 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -1339,47 +1339,7 @@ class ComputationNode : public ComputationNodeBase // abstract class that cannot
 
     // this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
     // TODO: move to -Base (or -Network?)
-    void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override
-    {
-        if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
-            LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());
-
-        for (size_t i = 0; i < m_inputs.size(); i++)
-        {
-            ComputationNodePtr child = Input(i);
-            if (child->m_needsGradient &&
-                (childrenInThisLoop && child->IsPartOfLoop() == IsPartOfLoop() ||
-                 childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()))
-            {
-                // fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
-                if (!m_needsGradient)
-                    LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
-#ifdef DISPLAY_DEBUG
-                fprintf(stderr, "    [%lu]: %ls(%ls)\n", i, child->OperationName().c_str(), child->NodeName().c_str());
-#endif
-#if DUMPOUTPUT
-                fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
-#endif
-                child->LazyZeroGradient(); // set gradient to 0 if this is the first time
-
-                // If we propagate from a loop to a node that is outside the loop, we are not efficient.
-                // This case is handled by SEQTraversalFlowControlNode::Backprop().
-                // The check below is to verify that.
-                if (IsPartOfLoop() && !child->IsPartOfLoop() && !fr.IsAllFrames())
-                {
-                    LogicError("Backprop: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
-                               NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
-                }
-
-                // fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
-                BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
-            }
-#ifdef DISPLAY_DEBUG
-            else
-                fprintf(stderr, "    [%lu]: %s(%s) (no gradient needed so don't compute for)\n", i, child->OperationName().c_str(), child->NodeName().c_str());
-#endif
-        }
-    }
+    void Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) override;
 
     // TODO: why of the inputs, and not the node itself?
     void /*ComputationNodeBase::*/ ZeroGradientsOfInputs() override // clears the lazy-init flags (LazyZeroGradient() actually clears the values lazily)
diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h
index a2cd5421d631..a6f8f6e2cb3d 100644
--- a/Source/ComputationNetworkLib/NonlinearityNodes.h
+++ b/Source/ComputationNetworkLib/NonlinearityNodes.h
@@ -369,25 +369,18 @@ class HardmaxNode : public SoftmaxNodeBase<ElemType> /*ComputationNode<ElemType>*/
 
     /*virtual*/ void BackpropToV(Matrix<ElemType>& gradient, const Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const Matrix<ElemType>& functionValues) override
     {
-        gradient;
-        inputFunctionValues;
-        inputGradientValues;
-        gradientValues;
-        LogicError("Hardmax is not differentiable and is used for evaluation only.");
+        gradient; inputFunctionValues; inputGradientValues; gradientValues;
+        // Hardmax cannot back-propagate a gradient.
+        // We must not forbid this function to be called, though, since Hardmax may be running
+        // as part of a recurrent decoding loop. Sequence-to-sequence models run the Hardmax
+        // node inside the training without back-propagating into it.
     }
 
-    virtual bool OutputUsedInComputingInputNodesGradients() const override
-    {
-        return false;
-    }
-    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
-    {
-        return false;
-    }
+    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
+    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
 
     /*virtual*/ void ForwardPropV(Matrix<ElemType>& functionValues, const Matrix<ElemType>& inputFunctionValues) override
     {
-        // TODO: temp solution, we need to write a math function specifically for this
         functionValues.AssignHardmaxOf(inputFunctionValues, true);
     }
 };
@@ -395,4 +388,4 @@
 template class HardmaxNode<float>;
 template class HardmaxNode<double>;
 
-} } }
+}}}
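AssignHardmaxOf() used by ForwardPropV() above replaces each column with a one-hot encoding of its maximum. A column-major sketch of that operation (illustrative, not the CNTK math library; the >= comparison keeps the last maximum on ties, mirroring the max-finding loop elsewhere in this series):

    #include <cstddef>

    // one-hot encode the maximum of each column, in place; column-major layout
    void Hardmax(float* data, size_t rows, size_t cols)
    {
        for (size_t j = 0; j < cols; j++)
        {
            float* col = data + j * rows;
            size_t maxI = 0;
            for (size_t i = 1; i < rows; i++)
                if (col[i] >= col[maxI]) // >= keeps the last maximum on ties
                    maxI = i;
            for (size_t i = 0; i < rows; i++)
                col[i] = (i == maxI) ? 1.0f : 0.0f;
        }
    }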
diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h
index feb7e485a20b..caa557b570d8 100644
--- a/Source/ComputationNetworkLib/RecurrentNodes.h
+++ b/Source/ComputationNetworkLib/RecurrentNodes.h
@@ -149,8 +149,11 @@ class DelayedValueNodeBase : public ComputationNode<ElemType>, public IRecurrent
         Base::Save(fstream);
 
         fstream << m_timeStep;
-        size_t colsDummy = 0;
-        fstream << GetSampleMatrixNumRows() << colsDummy; // #rows saved for legacy file format
+#if CURRENT_CNTK_MODEL_VERSION > CNTK_MODEL_VERSION_3
+        m_sampleLayout.Save(fstream);
+#else
+        fstream << (size_t)0 << (size_t)0; // used to be (rows,cols); no need since inferred in Validate(), and wrong for non-matrix tensors
+#endif
 
         fstream << m_initialActivationValue;
     }
@@ -162,14 +165,21 @@ class DelayedValueNodeBase : public ComputationNode<ElemType>, public IRecurrent
 
         fstream >> m_timeStep;
 
-        size_t rows, colsDummy;
-        fstream >> rows >> colsDummy;
+        if (modelVersion > CNTK_MODEL_VERSION_3)
+        {
+            TensorShape sampleLayout;
+            sampleLayout.Load(fstream);
+            SetDims(sampleLayout, HasMBLayout() /*may be true on reload (roll-back)*/);
+        }
+        else
+        {
+            size_t rows, colsDummy;
+            fstream >> rows >> colsDummy;
 
-        // BUGBUG: I got an error in when reloading persistent parameterse for a model that had dimension specified as 0, which did not get re-inferred correctly.
-        // We should either simply not write this parameter out at all (since it can always be inferred), or write the tensor shape.
-        if (GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date
-            SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here.
-        m_delayedValue.Resize(rows, 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag
+            if (rows != 0 && GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date
+                SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here.
+        }
+        m_delayedValue.Resize(m_sampleLayout.GetNumElements(), 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag
 
         if (modelVersion >= CNTK_MODEL_VERSION_2)
             fstream >> m_initialActivationValue;
@@ -1171,4 +1181,4 @@ class ShiftNode : public ComputationNode<ElemType>, public IRecurrentNode, publi
 
 #endif
 
-} } }
+}}}
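The Save/Load pair above is a standard version-gated format change: version-4 writers emit the full tensor shape, while the legacy branch still parses the old (rows, cols) pair. The same pattern reduced to a sketch with stand-in stream types and a flat shape vector:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    const uint32_t kVersion3 = 3; // legacy format stored (rows, cols); version 4 stores the shape

    void SaveShape(std::ostream& out, const std::vector<uint32_t>& shape)
    {
        uint32_t rank = (uint32_t)shape.size();
        out.write((const char*)&rank, sizeof(rank));
        out.write((const char*)shape.data(), rank * sizeof(uint32_t));
    }

    std::vector<uint32_t> LoadShape(std::istream& in, uint32_t fileVersion)
    {
        std::vector<uint32_t> shape;
        if (fileVersion > kVersion3) // new format: full tensor shape
        {
            uint32_t rank = 0;
            in.read((char*)&rank, sizeof(rank));
            shape.resize(rank);
            in.read((char*)shape.data(), rank * sizeof(uint32_t));
        }
        else // legacy format: (rows, cols); cols was never meaningful here
        {
            uint32_t rows = 0, cols = 0;
            in.read((char*)&rows, sizeof(rows));
            in.read((char*)&cols, sizeof(cols));
            shape.assign(1, rows);
        }
        return shape;
    }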
diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h
index 7c186e9b421a..def5932c48a8 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.h
+++ b/Source/ComputationNetworkLib/ReshapingNodes.h
@@ -196,7 +196,7 @@ class ReconcileMBLayoutNode : public ComputationNode<ElemType>, public NumInputs<2>
             // enforce compatibility of 'dataInput' with 'layoutInput'
             // TODO: how to deal with boundary flags?
             if (*m_pMBLayout != *Input(0)->GetMBLayout()) // this does a deep value-level comparison
-                InvalidArgument("%ls %ls operation discovered that %ls %ls operation produced an MB layout that is incompaitble with that of %ls %ls.",
+                InvalidArgument("%ls %ls operation discovered that %ls %ls operation produced an MB layout that is incompatible with that of %ls %ls.",
                                 NodeName().c_str(), OperationName().c_str(), Input(0)->NodeName().c_str(), Input(0)->OperationName().c_str(), Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str());

From b2f9efb735257211b0d84fcd63548aeaf9b58ad0 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 16 Mar 2016 08:29:43 -0700
Subject: [PATCH 19/26] bug fix: Gather/Scatter-related nodes must pass actual
 HasMBLayout() flag during validation

---
 Source/ComputationNetworkLib/ReshapingNodes.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp
index d2860fda66c6..5c2afc243f10 100644
--- a/Source/ComputationNetworkLib/ReshapingNodes.cpp
+++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp
@@ -172,7 +172,7 @@ template <class ElemType>
     if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1)
         InvalidArgument("%ls %ls operation requires the second argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str());
 
-    SetDims(Input(INDEXDATA));
+    SetDims(Input(INDEXDATA)->GetSampleLayout(), HasMBLayout());
 }
 
 template class PackedIndexNode<float>;
 template class PackedIndexNode<double>;
@@ -218,7 +218,7 @@ template <class ElemType>
         InvalidArgument("%ls %ls operation requires the first argument (indexData) to be a scalar sequence.", NodeName().c_str(), OperationName().c_str());
 
     // inherit tensor dimension from sourceData
-    SetDims(Input(SOURCEDATA));
+    SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout());
 }
 
 template class GatherPackedNode<float>;
 template class GatherPackedNode<double>;
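PackedIndexNode and GatherPackedNode above implement the Where/Gather pipeline from CNTK.core.bs: a condition sequence yields the positions of its non-zero entries, and gathering keeps just those frames; ScatterPacked is the inverse. Flattened to plain vectors, ignoring MBLayout packing, the semantics are roughly:

    #include <cstddef>
    #include <vector>

    // Where: indices of the non-zero entries of a condition sequence
    std::vector<size_t> Where(const std::vector<float>& cond)
    {
        std::vector<size_t> idx;
        for (size_t t = 0; t < cond.size(); t++)
            if (cond[t] != 0)
                idx.push_back(t);
        return idx;
    }

    // GatherPacked: keep only the selected frames, in order
    template <class T>
    std::vector<T> Gather(const std::vector<size_t>& idx, const std::vector<T>& x)
    {
        std::vector<T> out;
        out.reserve(idx.size());
        for (size_t i : idx)
            out.push_back(x[i]);
        return out;
    }

    // ScatterPacked: inverse mapping; unselected positions keep a default value
    template <class T>
    std::vector<T> Scatter(const std::vector<size_t>& idx, const std::vector<T>& y, size_t outputLen)
    {
        std::vector<T> out(outputLen, T());
        for (size_t i = 0; i < idx.size(); i++)
            out[idx[i]] = y[i];
        return out;
    }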
@@ -268,7 +268,7 @@ template <class ElemType>
     // TODO: We also know that indexData and sourceData must have the same MBLayout. But that is checked at runtime.
 
     // inherit tensor dimension from sourceData
-    SetDims(Input(SOURCEDATA));
+    SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout());
 }
 
 template class ScatterPackedNode<float>;
 template class ScatterPackedNode<double>;

From b499cc9752d791780ba28590103f49aa164f4a7c Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 16 Mar 2016 10:03:16 -0700
Subject: [PATCH 20/26] test of Backprop change

---
 Source/ComputationNetworkLib/ComputationNode.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp
index 9d0282875803..955fee00752f 100644
--- a/Source/ComputationNetworkLib/ComputationNode.cpp
+++ b/Source/ComputationNetworkLib/ComputationNode.cpp
@@ -31,8 +31,8 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
     // after nodes that propagate outside of the loop, and thus, in the last
     // time step of the sequence, have not yet received a gradient from a parent
     // and thus may not have had their gradient matrices allocated.
-    if (m_needsGradient)
-        LazyZeroGradient(); // set gradient to 0 if this is the first time
+    //if (m_needsGradient)
+    //    LazyZeroGradient(); // set gradient to 0 if this is the first time
 
     if (fr.IsAllFrames() && IsPartOfLoop() && childrenInThisLoop)
         LogicError("%ls %ls operation: Backprop called with whole-batch FrameRange on node that participates in a loop", NodeName().c_str(), OperationName().c_str());

From 36566d1bf7d6ba6de33334fe2d5ee78b0cb6c5fb Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Wed, 16 Mar 2016 14:20:14 -0700
Subject: [PATCH 21/26] BrainScriptNetworkBuilder now accepts any BS
 expression, instead of just a config record for ComputationNetwork. This
 makes it possible to use 'include' for network definitions and will enable
 network editing on the fly.

---
 Source/ActionsLib/NetworkFactory.cpp          | 28 +++++++++++++++----
 .../CNTK/BrainScript/BrainScriptEvaluator.cpp |  2 +-
 Source/CNTK/BrainScript/BrainScriptParser.cpp | 21 ++++----------
 Source/CNTK/BrainScript/BrainScriptParser.h   |  1 +
 4 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/Source/ActionsLib/NetworkFactory.cpp b/Source/ActionsLib/NetworkFactory.cpp
index d02b869829da..5fb4a02a11b5 100644
--- a/Source/ActionsLib/NetworkFactory.cpp
+++ b/Source/ActionsLib/NetworkFactory.cpp
@@ -75,12 +75,28 @@ function GetNetworkFactory(const ConfigRec
     // We interface with outer old CNTK config by taking the inner part, which we get as a string, as BrainScript.
     // We prepend a few standard definitions, and also definition of deviceId and precision, which all objects will pull out again when they are being constructed.
     // BUGBUG: We are not getting TextLocations right in this way! Do we need to inject location markers into the source? Moot once we fully switch to BS
-    wstring sourceCode = config.Exists(L"BrainScriptNetworkBuilder") ? config(L"BrainScriptNetworkBuilder") : config(L"ExperimentalNetworkBuilder");
-    auto configDirs = ConfigParameters::GetBrainScriptNetworkBuilderIncludePaths();
-    let expr = BS::ParseConfigDictFromString(L"include \'cntk.core.bs\'" // Note: Using lowercase here to match the Linux name of the CNTK exe.
-                                             + msra::strfun::wstrprintf(L"deviceId = %d ; precision = '%ls' ; network = new ComputationNetwork ", (int)deviceId, ElemTypeName<ElemType>())
-                                             + sourceCode, // source code has the form [ ... ] with brackets in the string
] with brackets in the string - move(configDirs)); // set include paths to all paths that configs were read from; no additional configurable include paths are supported by BrainScriptNetworkBuilder + wstring sourceOfNetwork = config.Exists(L"BrainScriptNetworkBuilder") ? config(L"BrainScriptNetworkBuilder") : config(L"ExperimentalNetworkBuilder"); + if (sourceOfNetwork.find_first_of(L"([") != 0) + InvalidArgument("BrainScript network description must be either a BS expression in ( ) or a config record in [ ]"); + + // set the include paths to all paths that configs were read from; no additional configurable include paths are supported by BrainScriptNetworkBuilder + auto includePaths = ConfigParameters::GetBrainScriptNetworkBuilderIncludePaths(); + + // inject additional items into the source code + // We support two ways of specifying the network in BrainScript: + // - BrainScriptNetworkBuilder = ( any BS expression that evaluates to a ComputationNetwork ) + // - BrainScriptNetworkBuilder = [ constructor parameters for a ComputationNetwork ] + if (sourceOfNetwork[0] == '[') // if [ ] form then we turn it into ComputationNetwork by constructing a ComputationNetwork from it + sourceOfNetwork = L"new ComputationNetwork " + sourceOfNetwork; + let sourceOfBS = msra::strfun::wstrprintf(L"include \'cntk.core.bs\'\n" // include our core lib. Note: Using lowercase here to match the Linux name of the CNTK exe. + L"deviceId = %d\n" // deviceId as passed in + L"precision = '%ls'\n" // 'float' or 'double' + L"network = %ls", // source code of expression that evaluates to a ComputationNetwork + (int)deviceId, ElemTypeName(), sourceOfNetwork.c_str()); + let expr = BS::ParseConfigDictFromString(sourceOfBS, move(includePaths)); + + // the rest is done in a lambda that is only evaluated when a virgin network is needed + // Note that evaluating the BrainScript *is* instantiating the network, so the evaluate call must be inside the lambda. return [expr](DEVICEID_TYPE /*deviceId*/) { // evaluate the parse tree, particularly the top-level field 'network' diff --git a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp index d333c1c679eb..5f08733f45ad 100644 --- a/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp +++ b/Source/CNTK/BrainScript/BrainScriptEvaluator.cpp @@ -768,7 +768,7 @@ void Do(ExpressionPtr e) shared_ptr EvaluateField(ExpressionPtr e, const wstring &id) { - return RecordLookup(e, id, e->location, nullptr /*scope for evaluating 'e'*/, L"$"); // we evaluate the member 'do' + return RecordLookup(e, id, e->location, nullptr /*scope for evaluating 'e'*/, L""); // we evaluate the member 'do' } ConfigValuePtr Evaluate(ExpressionPtr e) diff --git a/Source/CNTK/BrainScript/BrainScriptParser.cpp b/Source/CNTK/BrainScript/BrainScriptParser.cpp index dab31e32b782..42489c0a5f57 100644 --- a/Source/CNTK/BrainScript/BrainScriptParser.cpp +++ b/Source/CNTK/BrainScript/BrainScriptParser.cpp @@ -651,24 +651,13 @@ class Parser : public Lexer : Lexer(move(includePaths)) { infixPrecedence = map{ - {L".", 100}, {L"[", 100}, {L"(", 100}, // also sort-of infix operands... - {L"*", 10}, - {L"/", 10}, - {L".*", 10}, - {L"**", 10}, - {L"%", 10}, - {L"+", 9}, - {L"-", 9}, - {L"with", 9}, - {L"==", 8}, - {L"!=", 8}, - {L"<", 8}, - {L"<=", 8}, - {L">", 8}, - {L">=", 8}, + {L".", 99}, {L"[", 99}, {L"(", 99}, // also sort-of infix operands... 
+ {L"*", 10}, {L"/", 10}, {L".*", 10}, {L"**", 10}, {L"%", 10}, + {L"+", 9}, {L"-", 9}, {L"with", 9}, {L"==", 8}, + {L"!=", 8}, {L"<", 8}, {L"<=", 8}, {L">", 8}, {L">=", 8}, {L"&&", 7}, {L"||", 6}, - {L":", 5}, + {L":", 5}, {L"=>", 0}, }; SetSourceFile(move(sourceFile)); diff --git a/Source/CNTK/BrainScript/BrainScriptParser.h b/Source/CNTK/BrainScript/BrainScriptParser.h index 8e307ecca6fd..89dc12cdce73 100644 --- a/Source/CNTK/BrainScript/BrainScriptParser.h +++ b/Source/CNTK/BrainScript/BrainScriptParser.h @@ -135,6 +135,7 @@ typedef Expression::ExpressionPtr ExpressionPtr; // circumvent some circular def // access the parser through one of these functions ExpressionPtr ParseConfigDictFromString(wstring text, vector&& includePaths); // parses a list of dictionary members, returns a dictionary expression +// TODO: These rvalue references are no longer adding value, change to const<>& //ExpressionPtr ParseConfigDictFromFile(wstring path, vector includePaths); // likewise, but from a file path ExpressionPtr ParseConfigExpression(const wstring& sourceText, vector&& includePaths); // parses a single expression from sourceText, which is meant to contain an include statement, hence includePaths From c153a121ee50eccfadea3f91ec63017c5ca1040c Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 16 Mar 2016 17:07:29 -0700 Subject: [PATCH 22/26] PastValue now serializes tensor dimensions (required to bump the file-format version to 4) --- Source/ComputationNetworkLib/ComputationNode.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index ba0365d05dfd..69f9cd8215d0 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -32,7 +32,8 @@ #define CNTK_MODEL_VERSION_1 1 #define CNTK_MODEL_VERSION_2 2 #define CNTK_MODEL_VERSION_3 3 -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_3 +#define CNTK_MODEL_VERSION_4 4 // PastValue +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_4 extern bool g_shareNodeValueMatrices; From b371c5f0fee6cd161d29e4f2458799fe13dbdaba Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 16 Mar 2016 18:59:57 -0700 Subject: [PATCH 23/26] bug fix: DelayNode must save its dimensions, as inference does not work reliably otherwise; temporarily rolled back file-format version to test the above fix --- Source/ComputationNetworkLib/ComputationNode.h | 2 +- Source/ComputationNetworkLib/RecurrentNodes.h | 9 +++++---- Source/ComputationNetworkLib/ReshapingNodes.h | 13 +++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 69f9cd8215d0..b4a40e65f580 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -33,7 +33,7 @@ #define CNTK_MODEL_VERSION_2 2 #define CNTK_MODEL_VERSION_3 3 #define CNTK_MODEL_VERSION_4 4 // PastValue -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_4 +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_3 extern bool g_shareNodeValueMatrices; diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index caa557b570d8..11cd375a5521 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -152,7 +152,7 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent #if
CURRENT_CNTK_MODEL_VERSION > CNTK_MODEL_VERSION_3 m_sampleLayout.Save(fstream); #else - fstream << (size_t)0 << (size_t)0; // used to be (rows,cols); no need since inferred in Validate(), and wrong for non-matrix tensors + fstream << GetSampleLayout().GetNumElements() << (size_t)0; // used to be (rows,cols); no need since inferred in Validate(), and wrong for non-matrix tensors #endif fstream << m_initialActivationValue; @@ -175,9 +175,10 @@ class DelayedValueNodeBase : public ComputationNode, public IRecurrent { size_t rows, colsDummy; fstream >> rows >> colsDummy; - - if (rows != 0 && GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date - SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here. + // legacy format: if #rows matches then assume current tensor shape is up to date + // BUGBUG: This fails for non-column tensors. It should be sufficient to set + // these to 0 and rely on Validate(), but some unknown nodes in the loop don't do that right. + SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() } m_delayedValue.Resize(m_sampleLayout.GetNumElements(), 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index def5932c48a8..055c09df6df9 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -565,6 +565,19 @@ template class RowRepeatNode; // The result will have a different MBLayout reflecting the shortened result sequences. // ----------------------------------------------------------------------- +/* Notes on Where(), PackedIndex(), and Gather-/ScatterPacked(): +This is one of the few nodes that creates new MBLayouts inside this system. +This node is meant to operate jointly with PackedIndexNode. +The difference between Index and PackedIndex is that Index is in human-readable +form referring to indices WITHIN a sequence (since NDL and BS only talk about individual +sequences and never expose anything cross-sequence, except for aggregates like CE or BN). +PackedIndex maps that to the internal lookup table that has strides resolved etc. +The reason that PackedIndex is separate from Gather/ScatterPacked is that the GPU has no +access to the STL-heavy MBLayout. So PackedIndex packs the relevant information from +the MBLayout into a GPU object that then drives the memory-copy operations in Gather() +and Scatter().
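As a hedged illustration of that packed-gather idea (toy code with made-up names, not the actual CNTK kernels): once PackedIndex has resolved the sequence-relative indices into flat column positions, the copy loop itself needs no MBLayout at all and maps directly onto a GPU memory-copy kernel.

    #include <cstddef>

    // out(:,j) = in(:,packedIdx[j]) for column-major matrices; packedIdx was
    // precomputed on the CPU from the MBLayout, so this loop is layout-free.
    void GatherPackedToy(const float* in, std::size_t rows,
                         const std::size_t* packedIdx, std::size_t outCols, float* out)
    {
        for (std::size_t j = 0; j < outCols; j++)
            for (std::size_t i = 0; i < rows; i++)
                out[i + j * rows] = in[i + packedIdx[j] * rows];
    }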
+*/ + template class WhereNode : public ComputationNodeNonLooping, public NumInputs<1> { From 577b748634299d191a6436c45d5eedf2ecd4dc15 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Wed, 16 Mar 2016 19:03:19 -0700 Subject: [PATCH 24/26] bumped model version back up to 4 --- Source/ComputationNetworkLib/ComputationNode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index b4a40e65f580..69f9cd8215d0 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -33,7 +33,7 @@ #define CNTK_MODEL_VERSION_2 2 #define CNTK_MODEL_VERSION_3 3 #define CNTK_MODEL_VERSION_4 4 // PastValue -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_3 +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_4 extern bool g_shareNodeValueMatrices; From 95878a616322a8fdd8e801e2c9a3c1dfebe26304 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Thu, 17 Mar 2016 12:55:49 -0700 Subject: [PATCH 25/26] (minor comment edits) --- Source/ComputationNetworkLib/LinearAlgebraNodes.h | 8 +++++--- Source/ComputationNetworkLib/ReshapingNodes.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index bf4b9f22e461..56a471273b8d 100644 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -389,6 +389,9 @@ template class TimesNode; // Right operand and output can have MB layout, while left operand cannot. // This differs from TimesNode in that A is transposed, where A must be a // rank-1 or rank-2 tensor. +// A common use of transposition is trace(X'X) where X is a matrix of samples. +// This can NOT be implemented with this node. Instead, use +// SumColumnElements (ElementTimes (X, X)) // ----------------------------------------------------------------------- template @@ -698,7 +701,7 @@ template class SumColumnElementsNode; template class SumColumnElementsNode; // ----------------------------------------------------------------------- -// TransposeDimensionsNode (input, dim1, dim2) +// TransposeDimensions (input, dim1, dim2) // - swaps index dimensions dim1 and dim2. The values are 1-based; 1 stands for the leading dimension. // - new dimensions can be created; e.g. a column vector can be transposed into a row vector, which is a [1 x N] tensor // - transposing into the time dimension is currently not supported @@ -710,8 +713,7 @@ template class SumColumnElementsNode; template class TransposeDimensionsNode : public ComputationNode /*ComputationNode*/, public NumInputs<1> { - typedef ComputationNode Base; - UsingComputationNodeMembersBoilerplate; + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; static const std::wstring TypeName() { return L"TransposeDimensions"; } public: diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 055c09df6df9..7722a283a4e6 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -334,7 +334,7 @@ template class RowSliceNode; template class RowSliceNode; // ----------------------------------------------------------------------- -// RowStackNode (input0, input1, ...) +// RowStack (input0, input1, ...) // stacks multiple inputs on top of each other // The inputs will be spliced w.r.t. 
their first tensor dimension (the "row" dimension). // TODO: This is very close to the planned SpliceNode (just make m_spliceDim actually configurable) except for splicing along time. From 66d7aab044f93b607d0ad767b887dc78c46256a3 Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Thu, 17 Mar 2016 14:48:33 -0700 Subject: [PATCH 26/26] Fix run-test permissions --- .../ParallelTraining/NoQuantization/DoublePrecision/run-test | 0 .../ParallelTraining/NoQuantization/SinglePrecision/run-test | 0 Tests/EndToEndTests/ModelExport/Model0/run-test | 0 Tests/EndToEndTests/ModelExport/Model1/run-test | 0 Tests/EndToEndTests/Speech/LSTM/Truncated-Kaldi/run-test | 0 5 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/DoublePrecision/run-test mode change 100644 => 100755 Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/SinglePrecision/run-test mode change 100644 => 100755 Tests/EndToEndTests/ModelExport/Model0/run-test mode change 100644 => 100755 Tests/EndToEndTests/ModelExport/Model1/run-test mode change 100644 => 100755 Tests/EndToEndTests/Speech/LSTM/Truncated-Kaldi/run-test diff --git a/Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/DoublePrecision/run-test b/Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/DoublePrecision/run-test old mode 100644 new mode 100755 diff --git a/Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/SinglePrecision/run-test b/Tests/EndToEndTests/CNTKTextFormatReader/ParallelTraining/NoQuantization/SinglePrecision/run-test old mode 100644 new mode 100755 diff --git a/Tests/EndToEndTests/ModelExport/Model0/run-test b/Tests/EndToEndTests/ModelExport/Model0/run-test old mode 100644 new mode 100755 diff --git a/Tests/EndToEndTests/ModelExport/Model1/run-test b/Tests/EndToEndTests/ModelExport/Model1/run-test old mode 100644 new mode 100755 diff --git a/Tests/EndToEndTests/Speech/LSTM/Truncated-Kaldi/run-test b/Tests/EndToEndTests/Speech/LSTM/Truncated-Kaldi/run-test old mode 100644 new mode 100755
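To round off the RowStack description from the preceding patch: a hedged, self-contained sketch of splicing two column-major inputs along the first ("row") dimension. This is toy code under simplified assumptions, not the CNTK implementation.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // output column j = [a(:,j); b(:,j)], all matrices column-major
    std::vector<float> RowStackToy(const std::vector<float>& a, std::size_t aRows,
                                   const std::vector<float>& b, std::size_t bRows, std::size_t cols)
    {
        const std::size_t outRows = aRows + bRows;
        std::vector<float> out(outRows * cols);
        for (std::size_t j = 0; j < cols; j++)
        {
            for (std::size_t i = 0; i < aRows; i++) out[i + j * outRows] = a[i + j * aRows];
            for (std::size_t i = 0; i < bRows; i++) out[aRows + i + j * outRows] = b[i + j * bRows];
        }
        return out;
    }

    int main()
    {
        std::vector<float> a = {1, 2, 3, 4}; // 2 x 2, columns (1,2) and (3,4)
        std::vector<float> b = {5, 6};       // 1 x 2, columns (5) and (6)
        auto c = RowStackToy(a, 2, b, 1, 2); // 3 x 2, columns (1,2,5) and (3,4,6)
        for (float v : c) std::printf("%g ", v); // prints: 1 2 5 3 4 6
        std::printf("\n");
        return 0;
    }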